author    Dimitry Andric <dim@FreeBSD.org>    2016-07-23 20:41:05 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2016-07-23 20:41:05 +0000
commit    01095a5d43bbfde13731688ddcf6048ebb8b7721 (patch)
tree      4def12e759965de927d963ac65840d663ef9d1ea /test/CodeGen/X86
parent    f0f4822ed4b66e3579e92a89f368f8fb860e218e (diff)
Vendor import of llvm release_39 branch r276489 (vendor/llvm/llvm-release_39-r276489)
Diffstat (limited to 'test/CodeGen/X86')
-rw-r--r--test/CodeGen/X86/2006-05-02-InstrSched1.ll4
-rw-r--r--test/CodeGen/X86/2006-11-12-CSRetCC.ll9
-rw-r--r--test/CodeGen/X86/2007-08-10-SignExtSubreg.ll5
-rw-r--r--test/CodeGen/X86/2007-08-13-AppendingLinkage.ll12
-rw-r--r--test/CodeGen/X86/2007-10-15-CoalescerCrash.ll2
-rw-r--r--test/CodeGen/X86/2008-06-13-NotVolatileLoadStore.ll6
-rw-r--r--test/CodeGen/X86/2008-07-19-movups-spills.ll64
-rw-r--r--test/CodeGen/X86/2008-07-22-CombinerCrash.ll4
-rw-r--r--test/CodeGen/X86/2008-08-17-UComiCodeGenBug.ll2
-rw-r--r--test/CodeGen/X86/2008-08-19-SubAndFetch.ll12
-rw-r--r--test/CodeGen/X86/2008-09-11-CoalescerBug2.ll4
-rw-r--r--test/CodeGen/X86/2008-09-29-ReMatBug.ll2
-rw-r--r--test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll9
-rw-r--r--test/CodeGen/X86/2009-03-05-burr-list-crash.ll2
-rw-r--r--test/CodeGen/X86/2009-10-16-Scope.ll5
-rw-r--r--test/CodeGen/X86/2010-01-18-DbgValue.ll5
-rw-r--r--test/CodeGen/X86/2010-02-01-DbgValueCrash.ll5
-rw-r--r--test/CodeGen/X86/2010-05-25-DotDebugLoc.ll5
-rw-r--r--test/CodeGen/X86/2010-05-26-DotDebugLoc.ll9
-rw-r--r--test/CodeGen/X86/2010-05-28-Crash.ll7
-rw-r--r--test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll9
-rw-r--r--test/CodeGen/X86/2010-07-06-DbgCrash.ll7
-rw-r--r--test/CodeGen/X86/2010-08-04-StackVariable.ll8
-rw-r--r--test/CodeGen/X86/2010-09-16-EmptyFilename.ll7
-rw-r--r--test/CodeGen/X86/2010-11-02-DbgParameter.ll5
-rw-r--r--test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll7
-rw-r--r--test/CodeGen/X86/2011-06-14-PreschedRegalias.ll2
-rw-r--r--test/CodeGen/X86/2011-09-14-valcoalesce.ll2
-rw-r--r--test/CodeGen/X86/2011-10-21-widen-cmp.ll2
-rw-r--r--test/CodeGen/X86/2012-01-11-split-cv.ll17
-rw-r--r--test/CodeGen/X86/2012-01-12-extract-sv.ll9
-rw-r--r--test/CodeGen/X86/2012-04-26-sdglue.ll36
-rw-r--r--test/CodeGen/X86/2012-1-10-buildvector.ll2
-rw-r--r--test/CodeGen/X86/2012-11-30-handlemove-dbg.ll5
-rw-r--r--test/CodeGen/X86/2012-11-30-misched-dbg.ll10
-rw-r--r--test/CodeGen/X86/2012-11-30-regpres-dbg.ll7
-rw-r--r--test/CodeGen/X86/3addr-16bit.ll8
-rw-r--r--test/CodeGen/X86/AppendingLinkage.ll4
-rw-r--r--test/CodeGen/X86/GC/dynamic-frame-size.ll2
-rw-r--r--test/CodeGen/X86/GC/erlang-gc.ll4
-rw-r--r--test/CodeGen/X86/GC/ocaml-gc.ll4
-rw-r--r--test/CodeGen/X86/MachineSink-DbgValue.ll9
-rw-r--r--test/CodeGen/X86/MergeConsecutiveStores.ll24
-rw-r--r--test/CodeGen/X86/StackColoring-dbg.ll5
-rw-r--r--test/CodeGen/X86/StackColoring.ll175
-rw-r--r--test/CodeGen/X86/WidenArith.ll22
-rw-r--r--test/CodeGen/X86/abi-isel.ll72
-rw-r--r--test/CodeGen/X86/add-nsw-sext.ll6
-rw-r--r--test/CodeGen/X86/add.ll36
-rw-r--r--test/CodeGen/X86/alias-gep.ll22
-rw-r--r--test/CodeGen/X86/aligned-variadic.ll4
-rw-r--r--test/CodeGen/X86/alignment.ll4
-rw-r--r--test/CodeGen/X86/all-ones-vector.ll139
-rw-r--r--test/CodeGen/X86/and-encoding.ll27
-rw-r--r--test/CodeGen/X86/anyext.ll39
-rw-r--r--test/CodeGen/X86/atom-lea-sp.ll4
-rw-r--r--test/CodeGen/X86/atomic-eflags-reuse.ll179
-rw-r--r--test/CodeGen/X86/atomic-non-integer.ll2
-rw-r--r--test/CodeGen/X86/atomic128.ll11
-rw-r--r--test/CodeGen/X86/atomic16.ll56
-rw-r--r--test/CodeGen/X86/atomic8.ll48
-rw-r--r--test/CodeGen/X86/atomic_mi.ll17
-rw-r--r--test/CodeGen/X86/avoid-loop-align.ll2
-rw-r--r--test/CodeGen/X86/avx-basic.ll85
-rw-r--r--test/CodeGen/X86/avx-cast.ll11
-rw-r--r--test/CodeGen/X86/avx-intel-ocl.ll7
-rw-r--r--test/CodeGen/X86/avx-intrinsics-fast-isel.ll3778
-rw-r--r--test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll392
-rw-r--r--test/CodeGen/X86/avx-intrinsics-x86.ll4341
-rw-r--r--test/CodeGen/X86/avx-intrinsics-x86_64.ll1
-rw-r--r--test/CodeGen/X86/avx-isa-check.ll114
-rw-r--r--test/CodeGen/X86/avx-select.ll29
-rw-r--r--test/CodeGen/X86/avx-shift.ll9
-rwxr-xr-xtest/CodeGen/X86/avx-shuffle-x86_32.ll2
-rw-r--r--test/CodeGen/X86/avx-splat.ll14
-rwxr-xr-xtest/CodeGen/X86/avx-trunc.ll37
-rw-r--r--test/CodeGen/X86/avx-vbroadcast.ll20
-rw-r--r--test/CodeGen/X86/avx-vbroadcastf128.ll111
-rw-r--r--test/CodeGen/X86/avx-vextractf128.ll91
-rw-r--r--test/CodeGen/X86/avx-vperm2x128.ll433
-rw-r--r--test/CodeGen/X86/avx-vzeroupper.ll5
-rwxr-xr-xtest/CodeGen/X86/avx2-conversions.ll6
-rw-r--r--test/CodeGen/X86/avx2-intrinsics-fast-isel.ll3388
-rw-r--r--test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll361
-rw-r--r--test/CodeGen/X86/avx2-intrinsics-x86.ll1342
-rw-r--r--test/CodeGen/X86/avx2-logic.ll29
-rw-r--r--test/CodeGen/X86/avx2-nontemporal.ll65
-rw-r--r--test/CodeGen/X86/avx2-phaddsub.ll51
-rw-r--r--test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll110
-rw-r--r--test/CodeGen/X86/avx2-pmovxrm.ll201
-rw-r--r--test/CodeGen/X86/avx2-vbroadcast.ll13
-rw-r--r--test/CodeGen/X86/avx2-vbroadcasti128.ll129
-rw-r--r--test/CodeGen/X86/avx2-vector-shifts.ll445
-rwxr-xr-xtest/CodeGen/X86/avx2-vperm.ll27
-rw-r--r--test/CodeGen/X86/avx512-any_extend_load.ll70
-rw-r--r--test/CodeGen/X86/avx512-arith.ll142
-rw-r--r--test/CodeGen/X86/avx512-bugfix-23634.ll27
-rw-r--r--test/CodeGen/X86/avx512-bugfix-26264.ll47
-rw-r--r--test/CodeGen/X86/avx512-build-vector.ll7
-rw-r--r--test/CodeGen/X86/avx512-calling-conv.ll103
-rw-r--r--test/CodeGen/X86/avx512-cmp.ll130
-rw-r--r--test/CodeGen/X86/avx512-cvt.ll665
-rw-r--r--test/CodeGen/X86/avx512-ext.ll713
-rw-r--r--test/CodeGen/X86/avx512-extract-subvector.ll290
-rw-r--r--test/CodeGen/X86/avx512-fma-intrinsics.ll307
-rw-r--r--test/CodeGen/X86/avx512-fma.ll35
-rw-r--r--test/CodeGen/X86/avx512-gather-scatter-intrin.ll28
-rw-r--r--test/CodeGen/X86/avx512-inc-dec.ll2
-rw-r--r--test/CodeGen/X86/avx512-insert-extract.ll521
-rw-r--r--test/CodeGen/X86/avx512-intel-ocl.ll13
-rw-r--r--test/CodeGen/X86/avx512-intrinsics-fast-isel.ll1134
-rw-r--r--test/CodeGen/X86/avx512-intrinsics-upgrade.ll1089
-rw-r--r--test/CodeGen/X86/avx512-intrinsics.ll2817
-rw-r--r--test/CodeGen/X86/avx512-logic.ll66
-rw-r--r--test/CodeGen/X86/avx512-mask-op.ll2318
-rw-r--r--test/CodeGen/X86/avx512-mask-spills.ll126
-rw-r--r--test/CodeGen/X86/avx512-mov.ll376
-rw-r--r--test/CodeGen/X86/avx512-nontemporal.ll16
-rw-r--r--test/CodeGen/X86/avx512-scalarIntrinsics.ll66
-rw-r--r--test/CodeGen/X86/avx512-select.ll138
-rw-r--r--test/CodeGen/X86/avx512-skx-insert-subvec.ll73
-rw-r--r--test/CodeGen/X86/avx512-trunc.ll15
-rw-r--r--test/CodeGen/X86/avx512-unsafe-fp-math.ll107
-rw-r--r--test/CodeGen/X86/avx512-vbroadcast.ll48
-rw-r--r--test/CodeGen/X86/avx512-vec-cmp.ll1073
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll413
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll538
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics.ll905
-rw-r--r--test/CodeGen/X86/avx512bw-mask-op.ll107
-rw-r--r--test/CodeGen/X86/avx512bw-mov.ll185
-rw-r--r--test/CodeGen/X86/avx512bw-vec-cmp.ll113
-rw-r--r--test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll244
-rw-r--r--test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll629
-rw-r--r--test/CodeGen/X86/avx512bwvl-intrinsics.ll4769
-rw-r--r--test/CodeGen/X86/avx512bwvl-mov.ll129
-rw-r--r--test/CodeGen/X86/avx512bwvl-vec-cmp.ll225
-rw-r--r--test/CodeGen/X86/avx512cdvl-intrinsics.ll61
-rw-r--r--test/CodeGen/X86/avx512dq-intrinsics.ll277
-rw-r--r--test/CodeGen/X86/avx512dq-mask-op.ll55
-rw-r--r--test/CodeGen/X86/avx512dqvl-intrinsics.ll1530
-rw-r--r--test/CodeGen/X86/avx512ifma-intrinsics.ll105
-rw-r--r--test/CodeGen/X86/avx512ifmavl-intrinsics.ll226
-rw-r--r--test/CodeGen/X86/avx512vbmi-intrinsics.ll95
-rw-r--r--test/CodeGen/X86/avx512vbmivl-intrinsics.ll195
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll1391
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll2536
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics.ll6995
-rw-r--r--test/CodeGen/X86/avx512vl-logic.ll48
-rw-r--r--test/CodeGen/X86/avx512vl-mov.ll517
-rw-r--r--test/CodeGen/X86/avx512vl-vbroadcast.ll175
-rw-r--r--test/CodeGen/X86/avx512vl-vec-cmp.ll301
-rw-r--r--test/CodeGen/X86/base-pointer-and-cmpxchg.ll51
-rw-r--r--test/CodeGen/X86/bit-piece-comment.ll5
-rw-r--r--test/CodeGen/X86/bitreverse.ll382
-rw-r--r--test/CodeGen/X86/block-placement.ll273
-rw-r--r--test/CodeGen/X86/bmi-intrinsics-fast-isel-x86_64.ll165
-rw-r--r--test/CodeGen/X86/bmi-intrinsics-fast-isel.ll326
-rw-r--r--test/CodeGen/X86/bmi.ll555
-rw-r--r--test/CodeGen/X86/bool-zext.ll37
-rw-r--r--test/CodeGen/X86/br-fold.ll6
-rw-r--r--test/CodeGen/X86/break-false-dep.ll4
-rw-r--r--test/CodeGen/X86/bss_pagealigned.ll2
-rw-r--r--test/CodeGen/X86/bswap-vector.ll151
-rw-r--r--test/CodeGen/X86/bt.ll1225
-rw-r--r--test/CodeGen/X86/buildvec-insertvec.ll2
-rw-r--r--test/CodeGen/X86/byval2.ll4
-rw-r--r--test/CodeGen/X86/call-push.ll2
-rw-r--r--test/CodeGen/X86/catchpad-dynamic-alloca.ll65
-rw-r--r--test/CodeGen/X86/catchpad-lifetime.ll8
-rw-r--r--test/CodeGen/X86/catchret-regmask.ll73
-rw-r--r--test/CodeGen/X86/cfstring.ll2
-rw-r--r--test/CodeGen/X86/cleanuppad-inalloca.ll4
-rw-r--r--test/CodeGen/X86/cleanuppad-realign.ll4
-rw-r--r--test/CodeGen/X86/clear_upper_vector_element_bits.ll683
-rw-r--r--test/CodeGen/X86/clz.ll755
-rw-r--r--test/CodeGen/X86/cmov-into-branch.ll132
-rw-r--r--test/CodeGen/X86/cmov.ll4
-rw-r--r--test/CodeGen/X86/cmovcmov.ll6
-rw-r--r--test/CodeGen/X86/cmp.ll31
-rw-r--r--test/CodeGen/X86/cmpxchg-clobber-flags.ll12
-rw-r--r--test/CodeGen/X86/cmpxchg-i1.ll6
-rw-r--r--test/CodeGen/X86/cmpxchg-i128-i1.ll4
-rw-r--r--test/CodeGen/X86/coalescer-commute3.ll2
-rw-r--r--test/CodeGen/X86/code_placement_align_all.ll6
-rw-r--r--test/CodeGen/X86/code_placement_cold_loop_blocks.ll2
-rw-r--r--test/CodeGen/X86/code_placement_ignore_succ_in_inner_loop.ll2
-rw-r--r--test/CodeGen/X86/code_placement_loop_rotation.ll2
-rw-r--r--test/CodeGen/X86/code_placement_loop_rotation2.ll2
-rw-r--r--test/CodeGen/X86/code_placement_loop_rotation3.ll42
-rw-r--r--test/CodeGen/X86/code_placement_outline_optional_branches.ll2
-rw-r--r--test/CodeGen/X86/combine-multiplies.ll10
-rw-r--r--test/CodeGen/X86/combine-or.ll59
-rw-r--r--test/CodeGen/X86/combine-testm-and.ll57
-rw-r--r--test/CodeGen/X86/commute-blend-avx2.ll67
-rw-r--r--test/CodeGen/X86/commute-blend-sse41.ll27
-rw-r--r--test/CodeGen/X86/commute-fcmp.ll693
-rw-r--r--test/CodeGen/X86/constructor.ll12
-rw-r--r--test/CodeGen/X86/crash-lre-eliminate-dead-def.ll268
-rw-r--r--test/CodeGen/X86/ctpop-combine.ll38
-rw-r--r--test/CodeGen/X86/cxx_tlscc64.ll94
-rw-r--r--test/CodeGen/X86/dag-optnone.ll15
-rw-r--r--test/CodeGen/X86/darwin-stub.ll12
-rw-r--r--test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll253
-rw-r--r--test/CodeGen/X86/dbg-combine.ll5
-rw-r--r--test/CodeGen/X86/debugloc-argsize.ll5
-rw-r--r--test/CodeGen/X86/deopt-bundles.ll161
-rw-r--r--test/CodeGen/X86/deopt-intrinsic-cconv.ll34
-rw-r--r--test/CodeGen/X86/deopt-intrinsic.ll56
-rw-r--r--test/CodeGen/X86/dllexport-x86_64.ll31
-rw-r--r--test/CodeGen/X86/dllexport.ll18
-rw-r--r--test/CodeGen/X86/dwarf-comp-dir.ll2
-rw-r--r--test/CodeGen/X86/dynamic-alloca-in-entry.ll2
-rw-r--r--test/CodeGen/X86/dynamic-allocas-VLAs.ll12
-rw-r--r--test/CodeGen/X86/eflags-copy-expansion.mir67
-rw-r--r--test/CodeGen/X86/emutls-pic.ll26
-rw-r--r--test/CodeGen/X86/emutls-pie.ll21
-rw-r--r--test/CodeGen/X86/emutls_generic.ll46
-rw-r--r--test/CodeGen/X86/exedepsfix-broadcast.ll98
-rw-r--r--test/CodeGen/X86/expand-vr64-gr64-copy.mir2
-rw-r--r--test/CodeGen/X86/extractelement-index.ll643
-rw-r--r--test/CodeGen/X86/extractelement-load.ll91
-rw-r--r--test/CodeGen/X86/extractps.ll2
-rw-r--r--test/CodeGen/X86/f16c-intrinsics-fast-isel.ll132
-rw-r--r--test/CodeGen/X86/fast-isel-call.ll44
-rw-r--r--test/CodeGen/X86/fast-isel-cmp-branch2.ll5
-rw-r--r--test/CodeGen/X86/fast-isel-cmp-branch3.ll5
-rw-r--r--test/CodeGen/X86/fast-isel-float-half-convertion.ll2
-rw-r--r--test/CodeGen/X86/fast-isel-nontemporal.ll1083
-rw-r--r--test/CodeGen/X86/fast-isel-stackcheck.ll8
-rw-r--r--test/CodeGen/X86/fast-isel-vecload.ll21
-rw-r--r--test/CodeGen/X86/fast-isel-x86-64.ll8
-rw-r--r--test/CodeGen/X86/fast-isel-x86.ll35
-rw-r--r--test/CodeGen/X86/fastmath-float-half-conversion.ll4
-rw-r--r--test/CodeGen/X86/fixup-bw-copy.ll71
-rw-r--r--test/CodeGen/X86/fixup-bw-copy.mir156
-rw-r--r--test/CodeGen/X86/fixup-bw-inst.ll126
-rw-r--r--test/CodeGen/X86/float-conv-elim.ll2
-rw-r--r--test/CodeGen/X86/fma_patterns.ll41
-rw-r--r--test/CodeGen/X86/fold-push.ll2
-rw-r--r--test/CodeGen/X86/fold-tied-op.ll5
-rw-r--r--test/CodeGen/X86/fold-vector-sext-zext.ll153
-rw-r--r--test/CodeGen/X86/force-align-stack-alloca.ll14
-rw-r--r--test/CodeGen/X86/fp-logic.ll48
-rw-r--r--test/CodeGen/X86/fp-une-cmp.ll122
-rw-r--r--test/CodeGen/X86/fp128-cast.ll267
-rw-r--r--test/CodeGen/X86/fp128-compare.ll33
-rw-r--r--test/CodeGen/X86/fp128-select.ll35
-rw-r--r--test/CodeGen/X86/fpstack-debuginstr-kill.ll7
-rw-r--r--test/CodeGen/X86/frame-order.ll122
-rw-r--r--test/CodeGen/X86/ga-offset.ll13
-rw-r--r--test/CodeGen/X86/ga-offset2.ll10
-rw-r--r--test/CodeGen/X86/global-access-pie.ll123
-rw-r--r--test/CodeGen/X86/global-sections.ll35
-rw-r--r--test/CodeGen/X86/h-registers-3.ll28
-rw-r--r--test/CodeGen/X86/haddsub-2.ll1017
-rw-r--r--test/CodeGen/X86/haddsub-undef.ll321
-rw-r--r--test/CodeGen/X86/haddsub.ll307
-rw-r--r--test/CodeGen/X86/half.ll71
-rw-r--r--test/CodeGen/X86/hipe-cc.ll20
-rw-r--r--test/CodeGen/X86/hipe-cc64.ll21
-rw-r--r--test/CodeGen/X86/hipe-prologue.ll13
-rw-r--r--test/CodeGen/X86/hoist-invariant-load.ll35
-rw-r--r--test/CodeGen/X86/hoist-spill-lpad.ll62
-rw-r--r--test/CodeGen/X86/hoist-spill.ll121
-rw-r--r--test/CodeGen/X86/i16lshr8pat.ll32
-rw-r--r--test/CodeGen/X86/i386-setjmp-pic.ll23
-rw-r--r--test/CodeGen/X86/i386-shrink-wrapping.ll8
-rw-r--r--test/CodeGen/X86/i386-tlscall-fastregalloc.ll11
-rw-r--r--test/CodeGen/X86/i686-win-shrink-wrapping.ll44
-rw-r--r--test/CodeGen/X86/ifunc-asm.ll15
-rw-r--r--test/CodeGen/X86/implicit-null-check.ll42
-rw-r--r--test/CodeGen/X86/implicit-null-checks.mir266
-rw-r--r--test/CodeGen/X86/inalloca-ctor.ll4
-rw-r--r--test/CodeGen/X86/inalloca-invoke.ll3
-rw-r--r--test/CodeGen/X86/inalloca-stdcall.ll4
-rw-r--r--test/CodeGen/X86/inalloca.ll12
-rw-r--r--test/CodeGen/X86/indirect-hidden.ll4
-rw-r--r--test/CodeGen/X86/insertelement-zero.ll264
-rw-r--r--test/CodeGen/X86/insertps-combine.ll159
-rw-r--r--test/CodeGen/X86/interval-update-remat.ll161
-rw-r--r--test/CodeGen/X86/ipra-inline-asm.ll20
-rw-r--r--test/CodeGen/X86/ipra-local-linkage.ll30
-rw-r--r--test/CodeGen/X86/ipra-reg-usage.ll12
-rw-r--r--test/CodeGen/X86/ipra-transform.ll32
-rw-r--r--test/CodeGen/X86/lakemont.ll9
-rw-r--r--test/CodeGen/X86/lea-opt-memop-check-1.ll99
-rw-r--r--test/CodeGen/X86/lea-opt-memop-check-2.ll21
-rw-r--r--test/CodeGen/X86/lea-opt.ll73
-rw-r--r--test/CodeGen/X86/libcall-sret.ll25
-rw-r--r--test/CodeGen/X86/licm-dominance.ll61
-rw-r--r--test/CodeGen/X86/licm-symbol.ll2
-rw-r--r--test/CodeGen/X86/loc-remat.ll55
-rw-r--r--test/CodeGen/X86/local_stack_symbol_ordering.ll184
-rw-r--r--test/CodeGen/X86/localescape.ll34
-rw-r--r--test/CodeGen/X86/lock-inst-encoding.ll44
-rw-r--r--test/CodeGen/X86/loop-blocks.ll28
-rw-r--r--test/CodeGen/X86/lsr-static-addr.ll2
-rw-r--r--test/CodeGen/X86/lzcnt-tzcnt.ll209
-rw-r--r--test/CodeGen/X86/machine-combiner-int.ll37
-rw-r--r--test/CodeGen/X86/machine-copy-prop.mir227
-rw-r--r--test/CodeGen/X86/machine-cp.ll2
-rw-r--r--test/CodeGen/X86/machine-sink-and-implicit-null-checks.ll83
-rw-r--r--test/CodeGen/X86/machine-trace-metrics-crash.ll4
-rw-r--r--test/CodeGen/X86/masked_gather_scatter.ll382
-rw-r--r--test/CodeGen/X86/masked_memop.ll10394
-rw-r--r--test/CodeGen/X86/materialize-one.ll100
-rw-r--r--test/CodeGen/X86/materialize.ll216
-rw-r--r--test/CodeGen/X86/mbp-false-cfg-break.ll39
-rw-r--r--test/CodeGen/X86/mcinst-lowering.ll15
-rw-r--r--test/CodeGen/X86/mcu-abi.ll59
-rw-r--r--test/CodeGen/X86/memcmp.ll18
-rw-r--r--test/CodeGen/X86/memcpy-from-string.ll24
-rw-r--r--test/CodeGen/X86/memset-2.ll52
-rw-r--r--test/CodeGen/X86/memset-nonzero.ll470
-rw-r--r--test/CodeGen/X86/memset64-on-x86-32.ll58
-rw-r--r--test/CodeGen/X86/merge-consecutive-loads-128.ll783
-rw-r--r--test/CodeGen/X86/merge-consecutive-loads-256.ll756
-rw-r--r--test/CodeGen/X86/merge-consecutive-loads-512.ll718
-rw-r--r--test/CodeGen/X86/merge-sp-update-lea.ll32
-rw-r--r--test/CodeGen/X86/merge-store-partially-alias-loads.ll4
-rw-r--r--test/CodeGen/X86/mfence.ll35
-rw-r--r--test/CodeGen/X86/mingw-alloca.ll4
-rw-r--r--test/CodeGen/X86/misched-aa-colored.ll1
-rw-r--r--test/CodeGen/X86/misched-code-difference-with-debug.ll15
-rw-r--r--test/CodeGen/X86/misched-ilp.ll4
-rw-r--r--test/CodeGen/X86/mmx-bitcast-fold.ll12
-rw-r--r--test/CodeGen/X86/movgs.ll9
-rw-r--r--test/CodeGen/X86/movmsk.ll75
-rw-r--r--test/CodeGen/X86/movpc32-check.ll5
-rw-r--r--test/CodeGen/X86/movtopush.ll112
-rw-r--r--test/CodeGen/X86/movtopush64.ll193
-rw-r--r--test/CodeGen/X86/mul-i256.ll27
-rw-r--r--test/CodeGen/X86/mul128.ll13
-rw-r--r--test/CodeGen/X86/mul64.ll25
-rw-r--r--test/CodeGen/X86/musttail-varargs.ll11
-rw-r--r--test/CodeGen/X86/mwaitx.ll38
-rw-r--r--test/CodeGen/X86/negate-add-zero.ll2
-rw-r--r--test/CodeGen/X86/negative-offset.ll18
-rw-r--r--test/CodeGen/X86/new-remat.ll70
-rw-r--r--test/CodeGen/X86/no-prolog-kill.ll21
-rw-r--r--test/CodeGen/X86/no-sse2-avg.ll32
-rw-r--r--test/CodeGen/X86/nontemporal-2.ll1122
-rw-r--r--test/CodeGen/X86/nontemporal-loads.ll1638
-rw-r--r--test/CodeGen/X86/nontemporal.ll137
-rw-r--r--test/CodeGen/X86/noreturn-call.ll48
-rw-r--r--test/CodeGen/X86/null-streamer.ll5
-rw-r--r--test/CodeGen/X86/opt-ext-uses.ll4
-rw-r--r--test/CodeGen/X86/or-lea.ll13
-rw-r--r--test/CodeGen/X86/osx-private-labels.ll24
-rw-r--r--test/CodeGen/X86/patchable-prologue.ll67
-rw-r--r--test/CodeGen/X86/patchpoint-verifiable.mir2
-rw-r--r--test/CodeGen/X86/peephole-na-phys-copy-folding.ll17
-rw-r--r--test/CodeGen/X86/phaddsub.ll173
-rw-r--r--test/CodeGen/X86/phi-immediate-factoring.ll3
-rw-r--r--test/CodeGen/X86/phys-reg-local-regalloc.ll6
-rw-r--r--test/CodeGen/X86/phys_subreg_coalesce-2.ll2
-rw-r--r--test/CodeGen/X86/pic.ll2
-rw-r--r--test/CodeGen/X86/pic_jumptable.ll2
-rw-r--r--test/CodeGen/X86/pie.ll45
-rw-r--r--test/CodeGen/X86/pku.ll2
-rw-r--r--test/CodeGen/X86/pmul.ll978
-rw-r--r--test/CodeGen/X86/pop-stack-cleanup.ll13
-rw-r--r--test/CodeGen/X86/popcnt.ll243
-rw-r--r--test/CodeGen/X86/post-ra-sched.ll40
-rw-r--r--test/CodeGen/X86/postra-licm.ll4
-rw-r--r--test/CodeGen/X86/powi.ll5
-rw-r--r--test/CodeGen/X86/pr15267.ll16
-rw-r--r--test/CodeGen/X86/pr16360.ll17
-rw-r--r--test/CodeGen/X86/pr17764.ll14
-rw-r--r--test/CodeGen/X86/pr23664.ll2
-rw-r--r--test/CodeGen/X86/pr2585.ll32
-rw-r--r--test/CodeGen/X86/pr26350.ll21
-rw-r--r--test/CodeGen/X86/pr2659.ll2
-rw-r--r--test/CodeGen/X86/pr26652.ll9
-rw-r--r--test/CodeGen/X86/pr26757.ll34
-rw-r--r--test/CodeGen/X86/pr26835.ll10
-rw-r--r--test/CodeGen/X86/pr26870.ll37
-rw-r--r--test/CodeGen/X86/pr27071.ll29
-rw-r--r--test/CodeGen/X86/pr27501.ll67
-rw-r--r--test/CodeGen/X86/pr27591.ll51
-rw-r--r--test/CodeGen/X86/pr27681.mir87
-rw-r--r--test/CodeGen/X86/pr28173.ll41
-rw-r--r--test/CodeGen/X86/pr28444.ll27
-rw-r--r--test/CodeGen/X86/pr28472.ll11
-rw-r--r--test/CodeGen/X86/pr28489.ll15
-rw-r--r--test/CodeGen/X86/pr28515.ll16
-rw-r--r--test/CodeGen/X86/pr28560.ll13
-rw-r--r--test/CodeGen/X86/pr5145.ll16
-rw-r--r--test/CodeGen/X86/promote-i16.ll16
-rw-r--r--test/CodeGen/X86/ps4-noreturn.ll38
-rw-r--r--test/CodeGen/X86/pshufb-mask-comments.ll66
-rw-r--r--test/CodeGen/X86/psubus.ll95
-rw-r--r--test/CodeGen/X86/push-cfi-debug.ll7
-rw-r--r--test/CodeGen/X86/push-cfi.ll2
-rw-r--r--test/CodeGen/X86/ragreedy-hoist-spill.ll10
-rw-r--r--test/CodeGen/X86/reduce-trunc-shl.ll28
-rw-r--r--test/CodeGen/X86/regalloc-reconcile-broken-hints.ll2
-rw-r--r--test/CodeGen/X86/rem.ll89
-rw-r--r--test/CodeGen/X86/rem_crash.ll3
-rw-r--r--test/CodeGen/X86/return-ext.ll138
-rw-r--r--test/CodeGen/X86/rtm.ll18
-rw-r--r--test/CodeGen/X86/sad.ll1001
-rw-r--r--test/CodeGen/X86/safestack_ssp.ll27
-rw-r--r--test/CodeGen/X86/segmented-stacks.ll34
-rw-r--r--test/CodeGen/X86/seh-catch-all-win32.ll8
-rw-r--r--test/CodeGen/X86/seh-safe-div-win32.ll4
-rw-r--r--test/CodeGen/X86/seh-safe-div.ll8
-rw-r--r--test/CodeGen/X86/seh-stack-realign.ll12
-rw-r--r--test/CodeGen/X86/setcc-lowering.ll4
-rw-r--r--test/CodeGen/X86/setcc-narrowing.ll4
-rw-r--r--test/CodeGen/X86/setcc.ll26
-rw-r--r--test/CodeGen/X86/sext-ret-val.ll12
-rw-r--r--test/CodeGen/X86/sext-setcc-self.ll87
-rw-r--r--test/CodeGen/X86/sext-trunc.ll11
-rw-r--r--test/CodeGen/X86/shift-pcmp.ll45
-rw-r--r--test/CodeGen/X86/shrink-wrap-chkstk.ll8
-rw-r--r--test/CodeGen/X86/shrink_vmul.ll865
-rw-r--r--test/CodeGen/X86/sibcall-5.ll10
-rw-r--r--test/CodeGen/X86/sibcall-byval.ll6
-rw-r--r--test/CodeGen/X86/sincos-opt.ll30
-rw-r--r--test/CodeGen/X86/sink-blockfreq.ll4
-rw-r--r--test/CodeGen/X86/sink-cheap-instructions.ll2
-rw-r--r--test/CodeGen/X86/sjlj-eh.ll72
-rw-r--r--test/CodeGen/X86/slow-unaligned-mem.ll18
-rw-r--r--test/CodeGen/X86/sqrt-fastmath-mir.ll52
-rw-r--r--test/CodeGen/X86/sqrt-fastmath.ll29
-rw-r--r--test/CodeGen/X86/sse-intel-ocl.ll2
-rw-r--r--test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll35
-rw-r--r--test/CodeGen/X86/sse-intrinsics-fast-isel.ll2303
-rw-r--r--test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll27
-rw-r--r--test/CodeGen/X86/sse-intrinsics-x86.ll449
-rw-r--r--test/CodeGen/X86/sse1.ll15
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll76
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-fast-isel.ll3849
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll178
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-x86.ll937
-rw-r--r--test/CodeGen/X86/sse2.ll6
-rw-r--r--test/CodeGen/X86/sse3-avx-addsub-2.ll2
-rw-r--r--test/CodeGen/X86/sse3-avx-addsub.ll7
-rw-r--r--test/CodeGen/X86/sse3-intrinsics-fast-isel.ll13
-rw-r--r--test/CodeGen/X86/sse3.ll6
-rw-r--r--test/CodeGen/X86/sse41-intrinsics-fast-isel.ll1008
-rw-r--r--test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll228
-rw-r--r--test/CodeGen/X86/sse41-intrinsics-x86.ll355
-rw-r--r--test/CodeGen/X86/sse41-pmovxrm.ll (renamed from test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll)37
-rw-r--r--test/CodeGen/X86/sse41.ll53
-rw-r--r--test/CodeGen/X86/sse42-intrinsics-fast-isel-x86_64.ll26
-rw-r--r--test/CodeGen/X86/sse42-intrinsics-fast-isel.ll401
-rw-r--r--test/CodeGen/X86/sse42-intrinsics-x86.ll197
-rw-r--r--test/CodeGen/X86/sse42.ll53
-rw-r--r--test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll20
-rw-r--r--test/CodeGen/X86/sse4a-upgrade.ll39
-rw-r--r--test/CodeGen/X86/sse4a.ll77
-rw-r--r--test/CodeGen/X86/sse_partial_update.ll84
-rw-r--r--test/CodeGen/X86/ssp-data-layout.ll2
-rw-r--r--test/CodeGen/X86/ssp-guard-spill.ll54
-rw-r--r--test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll28
-rw-r--r--test/CodeGen/X86/stack-align.ll28
-rw-r--r--test/CodeGen/X86/stack-align2.ll5
-rw-r--r--test/CodeGen/X86/stack-folding-fp-avx1.ll59
-rw-r--r--test/CodeGen/X86/stack-folding-fp-avx512vl.ll137
-rw-r--r--test/CodeGen/X86/stack-folding-fp-sse42.ll18
-rw-r--r--test/CodeGen/X86/stack-folding-int-avx2.ll53
-rw-r--r--test/CodeGen/X86/stack-folding-xop.ll40
-rw-r--r--test/CodeGen/X86/stack-protector-dbginfo.ll11
-rw-r--r--test/CodeGen/X86/stack-protector-msvc.ll40
-rw-r--r--test/CodeGen/X86/stack-protector-target.ll27
-rw-r--r--test/CodeGen/X86/stack-protector-weight.ll40
-rw-r--r--test/CodeGen/X86/stack-protector.ll462
-rw-r--r--test/CodeGen/X86/stack_guard_remat.ll2
-rw-r--r--test/CodeGen/X86/stackguard-internal.ll15
-rw-r--r--test/CodeGen/X86/stackmap-frame-setup.ll4
-rw-r--r--test/CodeGen/X86/stackmap-large-constants.ll2
-rw-r--r--test/CodeGen/X86/stackmap-liveness.ll16
-rw-r--r--test/CodeGen/X86/statepoint-allocas.ll4
-rw-r--r--test/CodeGen/X86/statepoint-invoke.ll4
-rw-r--r--test/CodeGen/X86/statepoint-stack-usage.ll2
-rw-r--r--test/CodeGen/X86/statepoint-stackmap-format.ll10
-rw-r--r--test/CodeGen/X86/statepoint-uniqueing.ll31
-rw-r--r--test/CodeGen/X86/statepoint-vector-bad-spill.ll39
-rw-r--r--test/CodeGen/X86/statepoint-vector.ll2
-rw-r--r--test/CodeGen/X86/stdarg.ll2
-rw-r--r--test/CodeGen/X86/store-narrow.ll12
-rw-r--r--test/CodeGen/X86/store-zero-and-minus-one.ll88
-rw-r--r--test/CodeGen/X86/swift-return.ll206
-rw-r--r--test/CodeGen/X86/swifterror.ll359
-rw-r--r--test/CodeGen/X86/swiftself.ll62
-rw-r--r--test/CodeGen/X86/switch-bt.ll2
-rw-r--r--test/CodeGen/X86/switch-density.ll81
-rw-r--r--test/CodeGen/X86/switch-edge-weight.ll12
-rw-r--r--test/CodeGen/X86/switch-jump-table.ll2
-rw-r--r--test/CodeGen/X86/switch.ll77
-rw-r--r--test/CodeGen/X86/tail-call-attrs.ll4
-rw-r--r--test/CodeGen/X86/tail-call-casts.ll27
-rw-r--r--test/CodeGen/X86/tail-call-parameter-attrs-mismatch.ll40
-rw-r--r--test/CodeGen/X86/tail-merge-unreachable.ll34
-rw-r--r--test/CodeGen/X86/tail-opts.ll2
-rw-r--r--test/CodeGen/X86/tailcall-stackalign.ll2
-rw-r--r--test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll133
-rw-r--r--test/CodeGen/X86/tbm-intrinsics-fast-isel.ll216
-rw-r--r--test/CodeGen/X86/tls-android.ll8
-rw-r--r--test/CodeGen/X86/tls-pie.ll8
-rw-r--r--test/CodeGen/X86/tls-windows-itanium.ll30
-rw-r--r--test/CodeGen/X86/tls.ll15
-rw-r--r--test/CodeGen/X86/trunc-to-bool.ll4
-rw-r--r--test/CodeGen/X86/twoaddr-coalesce.ll2
-rw-r--r--test/CodeGen/X86/uint_to_fp-2.ll13
-rw-r--r--test/CodeGen/X86/uint_to_fp.ll33
-rw-r--r--test/CodeGen/X86/umul-with-overflow.ll3
-rw-r--r--test/CodeGen/X86/unaligned-load.ll4
-rw-r--r--test/CodeGen/X86/unaligned-spill-folding.ll2
-rw-r--r--test/CodeGen/X86/unknown-location.ll5
-rw-r--r--test/CodeGen/X86/unreachableblockelim.ll21
-rw-r--r--test/CodeGen/X86/unused_stackslots.ll246
-rw-r--r--test/CodeGen/X86/update-terminator.mir57
-rw-r--r--test/CodeGen/X86/urem-i8-constant.ll21
-rw-r--r--test/CodeGen/X86/urem-power-of-two.ll82
-rw-r--r--test/CodeGen/X86/utf16-cfstrings.ll2
-rw-r--r--test/CodeGen/X86/v4f32-immediate.ll15
-rw-r--r--test/CodeGen/X86/v8i1-masks.ll70
-rw-r--r--test/CodeGen/X86/vararg-callee-cleanup.ll2
-rw-r--r--test/CodeGen/X86/vec-sign.ll30
-rw-r--r--test/CodeGen/X86/vec_compare-sse4.ll81
-rw-r--r--test/CodeGen/X86/vec_ctbits.ll10
-rw-r--r--test/CodeGen/X86/vec_ext_inreg.ll74
-rw-r--r--test/CodeGen/X86/vec_extract-avx.ll177
-rw-r--r--test/CodeGen/X86/vec_extract-mmx.ll147
-rw-r--r--test/CodeGen/X86/vec_extract-sse4.ll115
-rw-r--r--test/CodeGen/X86/vec_extract.ll142
-rw-r--r--test/CodeGen/X86/vec_fabs.ll91
-rw-r--r--test/CodeGen/X86/vec_floor.ll361
-rw-r--r--test/CodeGen/X86/vec_fneg.ll99
-rw-r--r--test/CodeGen/X86/vec_fp_to_int.ll1
-rw-r--r--test/CodeGen/X86/vec_fpext.ll152
-rw-r--r--test/CodeGen/X86/vec_fptrunc.ll168
-rw-r--r--test/CodeGen/X86/vec_i64.ll43
-rw-r--r--test/CodeGen/X86/vec_ins_extract-1.ll87
-rw-r--r--test/CodeGen/X86/vec_ins_extract.ll3
-rw-r--r--test/CodeGen/X86/vec_insert-2.ll60
-rw-r--r--test/CodeGen/X86/vec_insert-3.ll23
-rw-r--r--test/CodeGen/X86/vec_insert-4.ll43
-rw-r--r--test/CodeGen/X86/vec_insert-5.ll166
-rw-r--r--test/CodeGen/X86/vec_insert-7.ll47
-rw-r--r--test/CodeGen/X86/vec_insert-8.ll57
-rw-r--r--test/CodeGen/X86/vec_insert-9.ll22
-rw-r--r--test/CodeGen/X86/vec_insert-mmx.ll90
-rw-r--r--test/CodeGen/X86/vec_int_to_fp.ll1697
-rw-r--r--test/CodeGen/X86/vec_loadsingles.ll85
-rw-r--r--test/CodeGen/X86/vec_logical.ll99
-rw-r--r--test/CodeGen/X86/vec_partial.ll21
-rw-r--r--test/CodeGen/X86/vec_sdiv_to_shift.ll285
-rw-r--r--test/CodeGen/X86/vec_set-2.ll34
-rw-r--r--test/CodeGen/X86/vec_set-3.ll17
-rw-r--r--test/CodeGen/X86/vec_set-4.ll46
-rw-r--r--test/CodeGen/X86/vec_set-6.ll20
-rw-r--r--test/CodeGen/X86/vec_set-7.ll20
-rw-r--r--test/CodeGen/X86/vec_set-8.ll19
-rw-r--r--test/CodeGen/X86/vec_set-A.ll13
-rw-r--r--test/CodeGen/X86/vec_set-B.ll32
-rw-r--r--test/CodeGen/X86/vec_set-C.ll19
-rw-r--r--test/CodeGen/X86/vec_set-D.ll15
-rw-r--r--test/CodeGen/X86/vec_set-F.ll34
-rw-r--r--test/CodeGen/X86/vec_set-H.ll30
-rw-r--r--test/CodeGen/X86/vec_set.ll43
-rw-r--r--test/CodeGen/X86/vec_setcc.ll242
-rw-r--r--test/CodeGen/X86/vec_shift.ll41
-rw-r--r--test/CodeGen/X86/vec_shift2.ll34
-rw-r--r--test/CodeGen/X86/vec_shift3.ll41
-rw-r--r--test/CodeGen/X86/vec_shift4.ll64
-rw-r--r--test/CodeGen/X86/vec_shift5.ll217
-rw-r--r--test/CodeGen/X86/vec_shift6.ll225
-rw-r--r--test/CodeGen/X86/vec_shift7.ll23
-rw-r--r--test/CodeGen/X86/vec_ss_load_fold.ll109
-rw-r--r--test/CodeGen/X86/vec_uint_to_fp-fastmath.ll4
-rw-r--r--test/CodeGen/X86/vector-bitreverse.ll3772
-rw-r--r--test/CodeGen/X86/vector-blend.ll285
-rw-r--r--test/CodeGen/X86/vector-compare-combines.ll47
-rw-r--r--test/CodeGen/X86/vector-compare-results.ll6625
-rw-r--r--test/CodeGen/X86/vector-gep.ll9
-rw-r--r--test/CodeGen/X86/vector-half-conversions.ll3922
-rw-r--r--test/CodeGen/X86/vector-idiv-sdiv-128.ll622
-rw-r--r--test/CodeGen/X86/vector-idiv-sdiv-256.ll545
-rw-r--r--test/CodeGen/X86/vector-idiv-sdiv-512.ll2392
-rw-r--r--test/CodeGen/X86/vector-idiv-udiv-128.ll592
-rw-r--r--test/CodeGen/X86/vector-idiv-udiv-256.ll551
-rw-r--r--test/CodeGen/X86/vector-idiv-udiv-512.ll2100
-rw-r--r--test/CodeGen/X86/vector-idiv.ll1297
-rw-r--r--test/CodeGen/X86/vector-lzcnt-128.ll1216
-rw-r--r--test/CodeGen/X86/vector-lzcnt-256.ll1378
-rw-r--r--test/CodeGen/X86/vector-lzcnt-512.ll171
-rw-r--r--test/CodeGen/X86/vector-pcmp.ll495
-rw-r--r--test/CodeGen/X86/vector-popcnt-512.ll288
-rw-r--r--test/CodeGen/X86/vector-rem.ll118
-rw-r--r--test/CodeGen/X86/vector-rotate-128.ll97
-rw-r--r--test/CodeGen/X86/vector-rotate-256.ll46
-rw-r--r--test/CodeGen/X86/vector-sext.ll834
-rw-r--r--test/CodeGen/X86/vector-shift-ashr-128.ll53
-rw-r--r--test/CodeGen/X86/vector-shift-ashr-256.ll44
-rw-r--r--test/CodeGen/X86/vector-shift-ashr-512.ll48
-rw-r--r--test/CodeGen/X86/vector-shift-lshr-128.ll71
-rw-r--r--test/CodeGen/X86/vector-shift-lshr-256.ll44
-rw-r--r--test/CodeGen/X86/vector-shift-lshr-512.ll30
-rw-r--r--test/CodeGen/X86/vector-shift-shl-128.ll32
-rw-r--r--test/CodeGen/X86/vector-shift-shl-256.ll11
-rw-r--r--test/CodeGen/X86/vector-shift-shl-512.ll10
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v16.ll81
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v2.ll142
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v4.ll211
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v8.ll82
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v16.ll338
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v32.ll180
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v4.ll105
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v8.ll250
-rw-r--r--test/CodeGen/X86/vector-shuffle-512-v16.ll125
-rw-r--r--test/CodeGen/X86/vector-shuffle-512-v32.ll74
-rw-r--r--test/CodeGen/X86/vector-shuffle-512-v64.ll88
-rw-r--r--test/CodeGen/X86/vector-shuffle-512-v8.ll364
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-avx.ll242
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-avx2.ll324
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll515
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-ssse3.ll267
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-xop.ll133
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining.ll99
-rw-r--r--test/CodeGen/X86/vector-shuffle-sse1.ll40
-rw-r--r--test/CodeGen/X86/vector-shuffle-sse41.ll59
-rw-r--r--test/CodeGen/X86/vector-shuffle-v1.ll158
-rw-r--r--test/CodeGen/X86/vector-shuffle-variable-128.ll1321
-rw-r--r--test/CodeGen/X86/vector-shuffle-variable-256.ll720
-rw-r--r--test/CodeGen/X86/vector-trunc-math.ll5315
-rw-r--r--test/CodeGen/X86/vector-trunc.ll44
-rw-r--r--test/CodeGen/X86/vector-tzcnt-128.ll818
-rw-r--r--test/CodeGen/X86/vector-tzcnt-256.ll446
-rw-r--r--test/CodeGen/X86/vector-tzcnt-512.ll693
-rw-r--r--test/CodeGen/X86/vector-zext.ll326
-rw-r--r--test/CodeGen/X86/viabs.ll703
-rw-r--r--test/CodeGen/X86/virtual-registers-cleared-in-machine-functions-liveins.ll4
-rw-r--r--test/CodeGen/X86/vselect-avx.ll102
-rw-r--r--test/CodeGen/X86/vselect-minmax.ll1
-rw-r--r--test/CodeGen/X86/vzero-excess.ll94
-rw-r--r--test/CodeGen/X86/warn-stack.ll2
-rw-r--r--test/CodeGen/X86/weak_def_can_be_hidden.ll8
-rw-r--r--test/CodeGen/X86/widen_bitops-0.ll307
-rw-r--r--test/CodeGen/X86/widen_bitops-1.ll235
-rw-r--r--test/CodeGen/X86/widen_compare-1.ll21
-rw-r--r--test/CodeGen/X86/widen_conv-1.ll95
-rw-r--r--test/CodeGen/X86/widen_conv-2.ll25
-rw-r--r--test/CodeGen/X86/widen_conv-3.ll147
-rw-r--r--test/CodeGen/X86/widen_conv-4.ll173
-rw-r--r--test/CodeGen/X86/widen_load-1.ll4
-rw-r--r--test/CodeGen/X86/widen_load-2.ll221
-rw-r--r--test/CodeGen/X86/win-alloca-expander.ll154
-rw-r--r--test/CodeGen/X86/win-catchpad-csrs.ll4
-rw-r--r--test/CodeGen/X86/win-catchpad-varargs.ll4
-rw-r--r--test/CodeGen/X86/win-catchpad.ll68
-rw-r--r--test/CodeGen/X86/win-cleanuppad.ll8
-rw-r--r--test/CodeGen/X86/win32-eh-states.ll16
-rw-r--r--test/CodeGen/X86/win32-eh.ll50
-rw-r--r--test/CodeGen/X86/win32-seh-catchpad-realign.ll2
-rw-r--r--test/CodeGen/X86/win32-seh-catchpad.ll8
-rw-r--r--test/CodeGen/X86/win32-seh-nested-finally.ll18
-rw-r--r--test/CodeGen/X86/win32_sret.ll47
-rw-r--r--test/CodeGen/X86/win64_eh.ll4
-rw-r--r--test/CodeGen/X86/win_cst_pool.ll26
-rw-r--r--test/CodeGen/X86/x86-16.ll20
-rw-r--r--test/CodeGen/X86/x86-32-intrcc.ll18
-rw-r--r--test/CodeGen/X86/x86-32-vector-calling-conv.ll24
-rw-r--r--test/CodeGen/X86/x86-64-flags-intrinsics.ll2
-rw-r--r--test/CodeGen/X86/x86-64-intrcc.ll21
-rw-r--r--test/CodeGen/X86/x86-64-pic.ll8
-rw-r--r--test/CodeGen/X86/x86-64-plt-relative-reloc.ll19
-rw-r--r--test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll6
-rw-r--r--test/CodeGen/X86/x86-big-ret.ll22
-rw-r--r--test/CodeGen/X86/x86-flags-intrinsics.ll2
-rw-r--r--test/CodeGen/X86/x86-interrupt_cc.ll33
-rw-r--r--test/CodeGen/X86/x86-interrupt_cld.ll17
-rw-r--r--test/CodeGen/X86/x86-interrupt_vzeroupper.ll19
-rw-r--r--test/CodeGen/X86/x86-plt-relative-reloc.ll16
-rw-r--r--test/CodeGen/X86/x86-shrink-wrap-unwind.ll4
-rw-r--r--test/CodeGen/X86/x86-shrink-wrapping.ll2
-rw-r--r--test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll29
-rw-r--r--test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll12
-rw-r--r--test/CodeGen/X86/x87.ll55
-rw-r--r--test/CodeGen/X86/xaluo.ll26
-rw-r--r--test/CodeGen/X86/xmulo.ll18
-rw-r--r--test/CodeGen/X86/xop-intrinsics-fast-isel.ll1111
-rw-r--r--test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll727
-rw-r--r--test/CodeGen/X86/xop-intrinsics-x86_64.ll950
-rw-r--r--test/CodeGen/X86/xop-mask-comments.ll188
-rw-r--r--test/CodeGen/X86/xray-attribute-instrumentation.ll13
-rw-r--r--test/CodeGen/X86/xray-selective-instrumentation-miss.ll9
-rw-r--r--test/CodeGen/X86/xray-selective-instrumentation.ll9
-rw-r--r--test/CodeGen/X86/zext-fold.ll7
692 files changed, 131173 insertions, 25670 deletions
diff --git a/test/CodeGen/X86/2006-05-02-InstrSched1.ll b/test/CodeGen/X86/2006-05-02-InstrSched1.ll
index 46c5e88955f4..acd32e49e60d 100644
--- a/test/CodeGen/X86/2006-05-02-InstrSched1.ll
+++ b/test/CodeGen/X86/2006-05-02-InstrSched1.ll
@@ -1,6 +1,6 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -relocation-model=static -stats 2>&1 | \
-; RUN: grep asm-printer | grep 16
+; RUN: llc < %s -mtriple=i686-unknown-linux -relocation-model=static -stats 2>&1 | \
+; RUN: grep asm-printer | grep 14
;
; It's possible to schedule this in 14 instructions by avoiding
; callee-save registers, but the scheduler isn't currently that
diff --git a/test/CodeGen/X86/2006-11-12-CSRetCC.ll b/test/CodeGen/X86/2006-11-12-CSRetCC.ll
index b6a8fc0bb2f8..9e1bf9edbbc4 100644
--- a/test/CodeGen/X86/2006-11-12-CSRetCC.ll
+++ b/test/CodeGen/X86/2006-11-12-CSRetCC.ll
@@ -6,7 +6,14 @@ target triple = "i686-pc-linux-gnu"
define i32 @main() {
; CHECK-LABEL: main:
; CHECK-NOT: ret
-; CHECK: subl $4, %{{.*}}
+; CHECK: subl $12, %esp
+; CHECK: pushl
+; CHECK: pushl
+; CHECK: pushl
+; CHECK: pushl
+; CHECK: pushl
+; CHECK: calll cexp
+; CHECK: addl $28, %esp
; CHECK: ret
entry:
diff --git a/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll b/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
index aa0ee5d07462..85a144083ece 100644
--- a/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
+++ b/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
@@ -2,9 +2,10 @@
@X = global i32 0 ; <i32*> [#uses=1]
-define signext i8 @_Z3fooi(i32 %x) {
+define i32 @_Z3fooi(i32 %x) {
entry:
store i32 %x, i32* @X, align 4
%retval67 = trunc i32 %x to i8 ; <i8> [#uses=1]
- ret i8 %retval67
+ %retval = sext i8 %retval67 to i32
+ ret i32 %retval
}
diff --git a/test/CodeGen/X86/2007-08-13-AppendingLinkage.ll b/test/CodeGen/X86/2007-08-13-AppendingLinkage.ll
deleted file mode 100644
index e08a5c493b5c..000000000000
--- a/test/CodeGen/X86/2007-08-13-AppendingLinkage.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: llc < %s -march=x86 | not grep drectve
-; PR1607
-
-%hlvm_programs_element = type { i8*, i32 (i32, i8**)* }
-@hlvm_programs = appending constant [1 x %hlvm_programs_element]
-zeroinitializer
-
-define %hlvm_programs_element* @hlvm_get_programs() {
-entry:
- ret %hlvm_programs_element* getelementptr([1 x %hlvm_programs_element], [1 x %hlvm_programs_element]*
- @hlvm_programs, i32 0, i32 0)
-}
diff --git a/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll b/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll
index c6eb6f0f0d7a..65e5ed762135 100644
--- a/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll
+++ b/test/CodeGen/X86/2007-10-15-CoalescerCrash.ll
@@ -224,7 +224,7 @@ declare void @fancy_abort(i8*, i32, i8*)
declare i8* @pool_alloc(%struct.alloc_pool_def*)
-declare void @llvm.memset.i64(i8*, i8, i64, i32)
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
declare void @link_block(%struct.basic_block_def*, %struct.basic_block_def*)
diff --git a/test/CodeGen/X86/2008-06-13-NotVolatileLoadStore.ll b/test/CodeGen/X86/2008-06-13-NotVolatileLoadStore.ll
index de95e7925f08..581fae269021 100644
--- a/test/CodeGen/X86/2008-06-13-NotVolatileLoadStore.ll
+++ b/test/CodeGen/X86/2008-06-13-NotVolatileLoadStore.ll
@@ -1,9 +1,11 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86 -fixup-byte-word-insts=0 | FileCheck %s -check-prefix=CHECK -check-prefix=BWOFF
+; RUN: llc < %s -march=x86 -fixup-byte-word-insts=1 | FileCheck %s -check-prefix=CHECK -check-prefix=BWON
; These transforms are turned off for load volatiles and stores.
; Check that they weren't turned off for all loads and stores!
; CHECK-LABEL: f:
; CHECK-NOT: movsd
-; CHECK: movw
+; BWOFF: movw
+; BWON: movzwl
; CHECK: addw
@atomic = global double 0.000000e+00 ; <double*> [#uses=1]
diff --git a/test/CodeGen/X86/2008-07-19-movups-spills.ll b/test/CodeGen/X86/2008-07-19-movups-spills.ll
index 45ea69943e87..052cf97fefe9 100644
--- a/test/CodeGen/X86/2008-07-19-movups-spills.ll
+++ b/test/CodeGen/X86/2008-07-19-movups-spills.ll
@@ -4,38 +4,38 @@
; Verify that movups is still generated with an aligned stack for the globals
; that must be accessed unaligned
-external global <4 x float>, align 1 ; <<4 x float>*>:0 [#uses=2]
-external global <4 x float>, align 1 ; <<4 x float>*>:1 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:2 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:3 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:4 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:5 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:6 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:7 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:8 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:9 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:10 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:11 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:12 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:13 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:14 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:15 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:16 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:17 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:18 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:19 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:20 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:21 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:22 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:23 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:24 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:25 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:26 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:27 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:28 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:29 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:30 [#uses=1]
-external global <4 x float>, align 1 ; <<4 x float>*>:31 [#uses=1]
+@0 = external global <4 x float>, align 1 ; <<4 x float>*>:0 [#uses=2]
+@1 = external global <4 x float>, align 1 ; <<4 x float>*>:1 [#uses=1]
+@2 = external global <4 x float>, align 1 ; <<4 x float>*>:2 [#uses=1]
+@3 = external global <4 x float>, align 1 ; <<4 x float>*>:3 [#uses=1]
+@4 = external global <4 x float>, align 1 ; <<4 x float>*>:4 [#uses=1]
+@5 = external global <4 x float>, align 1 ; <<4 x float>*>:5 [#uses=1]
+@6 = external global <4 x float>, align 1 ; <<4 x float>*>:6 [#uses=1]
+@7 = external global <4 x float>, align 1 ; <<4 x float>*>:7 [#uses=1]
+@8 = external global <4 x float>, align 1 ; <<4 x float>*>:8 [#uses=1]
+@9 = external global <4 x float>, align 1 ; <<4 x float>*>:9 [#uses=1]
+@10 = external global <4 x float>, align 1 ; <<4 x float>*>:10 [#uses=1]
+@11 = external global <4 x float>, align 1 ; <<4 x float>*>:11 [#uses=1]
+@12 = external global <4 x float>, align 1 ; <<4 x float>*>:12 [#uses=1]
+@13 = external global <4 x float>, align 1 ; <<4 x float>*>:13 [#uses=1]
+@14 = external global <4 x float>, align 1 ; <<4 x float>*>:14 [#uses=1]
+@15 = external global <4 x float>, align 1 ; <<4 x float>*>:15 [#uses=1]
+@16 = external global <4 x float>, align 1 ; <<4 x float>*>:16 [#uses=1]
+@17 = external global <4 x float>, align 1 ; <<4 x float>*>:17 [#uses=1]
+@18 = external global <4 x float>, align 1 ; <<4 x float>*>:18 [#uses=1]
+@19 = external global <4 x float>, align 1 ; <<4 x float>*>:19 [#uses=1]
+@20 = external global <4 x float>, align 1 ; <<4 x float>*>:20 [#uses=1]
+@21 = external global <4 x float>, align 1 ; <<4 x float>*>:21 [#uses=1]
+@22 = external global <4 x float>, align 1 ; <<4 x float>*>:22 [#uses=1]
+@23 = external global <4 x float>, align 1 ; <<4 x float>*>:23 [#uses=1]
+@24 = external global <4 x float>, align 1 ; <<4 x float>*>:24 [#uses=1]
+@25 = external global <4 x float>, align 1 ; <<4 x float>*>:25 [#uses=1]
+@26 = external global <4 x float>, align 1 ; <<4 x float>*>:26 [#uses=1]
+@27 = external global <4 x float>, align 1 ; <<4 x float>*>:27 [#uses=1]
+@28 = external global <4 x float>, align 1 ; <<4 x float>*>:28 [#uses=1]
+@29 = external global <4 x float>, align 1 ; <<4 x float>*>:29 [#uses=1]
+@30 = external global <4 x float>, align 1 ; <<4 x float>*>:30 [#uses=1]
+@31 = external global <4 x float>, align 1 ; <<4 x float>*>:31 [#uses=1]
declare void @abort()
diff --git a/test/CodeGen/X86/2008-07-22-CombinerCrash.ll b/test/CodeGen/X86/2008-07-22-CombinerCrash.ll
index 35bb5f054282..719baf5cc945 100644
--- a/test/CodeGen/X86/2008-07-22-CombinerCrash.ll
+++ b/test/CodeGen/X86/2008-07-22-CombinerCrash.ll
@@ -1,8 +1,8 @@
; RUN: llc < %s -march=x86 -mattr=+sse2
; PR2566
-external global i16 ; <i16*>:0 [#uses=1]
-external global <4 x i16> ; <<4 x i16>*>:1 [#uses=1]
+@0 = external global i16 ; <i16*>:0 [#uses=1]
+@1 = external global <4 x i16> ; <<4 x i16>*>:1 [#uses=1]
declare void @abort()
diff --git a/test/CodeGen/X86/2008-08-17-UComiCodeGenBug.ll b/test/CodeGen/X86/2008-08-17-UComiCodeGenBug.ll
index 32f6ca0ce086..907f4cc4ca3f 100644
--- a/test/CodeGen/X86/2008-08-17-UComiCodeGenBug.ll
+++ b/test/CodeGen/X86/2008-08-17-UComiCodeGenBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | grep movzbl
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | grep xorl
define i32 @foo(<4 x float> %a, <4 x float> %b) nounwind {
entry:
diff --git a/test/CodeGen/X86/2008-08-19-SubAndFetch.ll b/test/CodeGen/X86/2008-08-19-SubAndFetch.ll
deleted file mode 100644
index 9324d5dfa3bb..000000000000
--- a/test/CodeGen/X86/2008-08-19-SubAndFetch.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
-
-@var = external global i64 ; <i64*> [#uses=1]
-
-define i32 @main() nounwind {
-entry:
-; CHECK-LABEL: main:
-; CHECK: lock
-; CHECK: decq
- atomicrmw sub i64* @var, i64 1 monotonic
- unreachable
-}
diff --git a/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll b/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
index 757dff4230fc..a9875521fb18 100644
--- a/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
+++ b/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86
-; RUN: llc -pre-RA-sched=source < %s -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix=SOURCE-SCHED
+; RUN: llc -pre-RA-sched=source < %s -mtriple=i686-unknown-linux -mcpu=corei7 | FileCheck %s --check-prefix=SOURCE-SCHED
; PR2748
@g_73 = external global i32 ; <i32*> [#uses=1]
@@ -10,9 +10,9 @@ entry:
; SOURCE-SCHED: subl
; SOURCE-SCHED: movl
; SOURCE-SCHED: sarl
+; SOURCE-SCHED: xorl
; SOURCE-SCHED: cmpl
; SOURCE-SCHED: setg
-; SOURCE-SCHED: movzbl
; SOURCE-SCHED: movb
; SOURCE-SCHED: xorl
; SOURCE-SCHED: subl
diff --git a/test/CodeGen/X86/2008-09-29-ReMatBug.ll b/test/CodeGen/X86/2008-09-29-ReMatBug.ll
index 754fd8f0ab64..cc481a056c84 100644
--- a/test/CodeGen/X86/2008-09-29-ReMatBug.ll
+++ b/test/CodeGen/X86/2008-09-29-ReMatBug.ll
@@ -5,7 +5,7 @@
%struct.XCStringList = type { i32, %struct._XCStringListNode* }
%struct._XCStringListNode = type { [3 x i8], [0 x i8], i8 }
%struct.__builtin_CFString = type { i32*, i32, i8*, i32 }
-internal constant %struct.__builtin_CFString { i32* getelementptr ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr ([3 x i8], [3 x i8]* @"\01LC", i32 0, i32 0), i32 2 } ; <%struct.__builtin_CFString*>:0 [#uses=1]
+@0 = internal constant %struct.__builtin_CFString { i32* getelementptr ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr ([3 x i8], [3 x i8]* @"\01LC", i32 0, i32 0), i32 2 } ; <%struct.__builtin_CFString*>:0 [#uses=1]
@__CFConstantStringClassReference = external global [0 x i32] ; <[0 x i32]*> [#uses=1]
@"\01LC" = internal constant [3 x i8] c"NO\00" ; <[3 x i8]*> [#uses=1]
@"\01LC1" = internal constant [1 x i8] zeroinitializer ; <[1 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
index 2abb5ba7cd52..8edaf3f1fa34 100644
--- a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
+++ b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s
-; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -verify-machineinstrs | FileCheck %s
; PR3538
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin9"
@@ -12,7 +12,7 @@ define signext i8 @foo(i8* %s1) nounwind ssp {
; movq %rax, %rsp
; CHECK-LABEL: @foo
-; CHECK: movq -40(%rbp), %rsp
+; CHECK: movq -{{[0-9]+}}(%rbp), %rsp
entry:
%s1_addr = alloca i8* ; <i8**> [#uses=2]
@@ -76,9 +76,10 @@ declare i64 @strlen(i8*) nounwind readonly
declare void @llvm.stackrestore(i8*) nounwind
+!llvm.dbg.cu = !{!2}
!0 = !DILocalVariable(name: "s1", line: 2, arg: 1, scope: !1, file: !2, type: !6)
-!1 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scope: !2, type: !3)
-!2 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: 0, file: !17, enums: !18, retainedTypes: !18)
+!1 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !2, scope: !2, type: !3)
+!2 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: FullDebug, file: !17, enums: !18, retainedTypes: !18)
!3 = !DISubroutineType(types: !4)
!4 = !{!5, !6}
!5 = !DIBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
diff --git a/test/CodeGen/X86/2009-03-05-burr-list-crash.ll b/test/CodeGen/X86/2009-03-05-burr-list-crash.ll
index 853bb16aa327..e8b6a3142697 100644
--- a/test/CodeGen/X86/2009-03-05-burr-list-crash.ll
+++ b/test/CodeGen/X86/2009-03-05-burr-list-crash.ll
@@ -2,7 +2,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
-external global i32 ; <i32*>:0 [#uses=1]
+@0 = external global i32 ; <i32*>:0 [#uses=1]
declare i64 @strlen(i8* nocapture) nounwind readonly
diff --git a/test/CodeGen/X86/2009-10-16-Scope.ll b/test/CodeGen/X86/2009-10-16-Scope.ll
index 06a56ad90205..840a479de251 100644
--- a/test/CodeGen/X86/2009-10-16-Scope.ll
+++ b/test/CodeGen/X86/2009-10-16-Scope.ll
@@ -22,10 +22,11 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
declare i32 @foo(i32) ssp
+!llvm.dbg.cu = !{!3}
!0 = !DILocation(line: 5, column: 2, scope: !1)
!1 = distinct !DILexicalBlock(line: 1, column: 1, file: null, scope: !2)
-!2 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scope: !3)
-!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang 1.1", isOptimized: true, emissionKind: 0, file: !8, retainedTypes: !9)
+!2 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, scope: !3)
+!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang 1.1", isOptimized: true, emissionKind: FullDebug, file: !8, retainedTypes: !9)
!4 = !DILocalVariable(name: "count_", line: 5, scope: !5, file: !3, type: !6)
!5 = distinct !DILexicalBlock(line: 1, column: 1, file: null, scope: !1)
!6 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
diff --git a/test/CodeGen/X86/2010-01-18-DbgValue.ll b/test/CodeGen/X86/2010-01-18-DbgValue.ll
index c15e7a79bfa1..8b11fd86ef17 100644
--- a/test/CodeGen/X86/2010-01-18-DbgValue.ll
+++ b/test/CodeGen/X86/2010-01-18-DbgValue.ll
@@ -32,9 +32,9 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
!llvm.module.flags = !{!21}
!0 = !DILocalVariable(name: "my_r0", line: 11, arg: 1, scope: !1, file: !2, type: !7)
-!1 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scopeLine: 11, file: !19, scope: !2, type: !4)
+!1 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, scopeLine: 11, file: !19, scope: !2, type: !4)
!2 = !DIFile(filename: "b2.c", directory: "/tmp/")
-!3 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: false, emissionKind: 0, file: !19, enums: !20, retainedTypes: !20, subprograms: !18)
+!3 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: false, emissionKind: FullDebug, file: !19, enums: !20, retainedTypes: !20)
!4 = !DISubroutineType(types: !5)
!5 = !{!6, !7}
!6 = !DIBasicType(tag: DW_TAG_base_type, name: "double", size: 64, align: 64, encoding: DW_ATE_float)
@@ -49,7 +49,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
!15 = !DILocation(line: 11, scope: !1)
!16 = !DILocation(line: 12, scope: !17)
!17 = distinct !DILexicalBlock(line: 11, column: 0, file: !19, scope: !1)
-!18 = !{!1}
!19 = !DIFile(filename: "b2.c", directory: "/tmp/")
!20 = !{}
!21 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
index eb077c074bc2..b4bb865f7f7e 100644
--- a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
+++ b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
@@ -18,7 +18,8 @@ entry:
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
declare void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.ComplexType*) nounwind readnone
-!0 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: 0, file: !15, enums: !16, retainedTypes: !16)
+!llvm.dbg.cu = !{!0}
+!0 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: FullDebug, file: !15, enums: !16, retainedTypes: !16)
!1 = !DIDerivedType(tag: DW_TAG_const_type, size: 192, align: 64, file: !15, scope: !0, baseType: !2)
!2 = !DICompositeType(tag: DW_TAG_structure_type, name: "C", line: 1, size: 192, align: 64, file: !15, scope: !0, elements: !3)
!3 = !{!4, !6, !7}
@@ -28,7 +29,7 @@ declare void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.
!7 = !DIDerivedType(tag: DW_TAG_member, name: "z", line: 1, size: 64, align: 64, offset: 128, file: !15, scope: !2, baseType: !5)
!8 = !DILocalVariable(name: "t", line: 5, scope: !9, file: !0, type: !2)
!9 = distinct !DILexicalBlock(line: 0, column: 0, file: null, scope: !10)
-!10 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scope: !0, type: !11)
+!10 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !0, scope: !0, type: !11)
!11 = !DISubroutineType(types: !12)
!12 = !{!13}
!13 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
diff --git a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
index f157d5011b02..3172f82b2860 100644
--- a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
@@ -200,9 +200,9 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.module.flags = !{!48}
!0 = !DILocalVariable(name: "a", line: 1921, arg: 1, scope: !1, file: !2, type: !9)
-!1 = distinct !DISubprogram(name: "__divsc3", linkageName: "__divsc3", line: 1922, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, scopeLine: 1922, file: !45, scope: !2, type: !4, variables: !43)
+!1 = distinct !DISubprogram(name: "__divsc3", linkageName: "__divsc3", line: 1922, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !3, scopeLine: 1922, file: !45, scope: !2, type: !4, variables: !43)
!2 = !DIFile(filename: "libgcc2.c", directory: "/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc")
-!3 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: 1, file: !45, enums: !47, retainedTypes: !47, subprograms: !44, imports: null)
+!3 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: FullDebug, file: !45, enums: !47, retainedTypes: !47, imports: null)
!4 = !DISubroutineType(types: !5)
!5 = !{!6, !9, !9, !9, !9}
!6 = !DIDerivedType(tag: DW_TAG_typedef, name: "SCtype", line: 170, file: !46, scope: !7, baseType: !8)
@@ -243,7 +243,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!41 = !DILocation(line: 1965, scope: !15)
!42 = !DILocation(line: 1969, scope: !15)
!43 = !{!0, !11, !12, !13, !14, !16, !17, !18}
-!44 = !{!1}
!45 = !DIFile(filename: "libgcc2.c", directory: "/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc")
!46 = !DIFile(filename: "libgcc2.h", directory: "/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc")
!47 = !{}
diff --git a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
index a34e7bd9fe43..30e5e346d294 100644
--- a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
@@ -26,14 +26,14 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!0 = !DIGlobalVariable(name: "ret", line: 7, isLocal: false, isDefinition: true, scope: !1, file: !1, type: !3)
!1 = !DIFile(filename: "foo.c", directory: "/tmp/")
-!2 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: 1, file: !36, enums: !37, retainedTypes: !37, subprograms: !32, globals: !31, imports: !37)
+!2 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: FullDebug, file: !36, enums: !37, retainedTypes: !37, globals: !31, imports: !37)
!3 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!4 = !DILocalVariable(name: "x", line: 12, arg: 1, scope: !5, file: !1, type: !3)
-!5 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 13, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, scopeLine: 13, file: !36, scope: !1, type: !6, variables: !33)
+!5 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 13, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !2, scopeLine: 13, file: !36, scope: !1, type: !6, variables: !33)
!6 = !DISubroutineType(types: !7)
!7 = !{null, !3}
!8 = !DILocalVariable(name: "myvar", line: 17, arg: 1, scope: !9, file: !1, type: !13)
-!9 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 17, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, scopeLine: 17, file: !36, scope: !1, type: !10, variables: !34)
+!9 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 17, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !2, scopeLine: 17, file: !36, scope: !1, type: !10, variables: !34)
!10 = !DISubroutineType(types: !11)
!11 = !{!12, !13}
!12 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, file: !36, scope: !1, baseType: null)
@@ -43,7 +43,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!16 = !DIDerivedType(tag: DW_TAG_member, name: "c", line: 3, size: 32, align: 32, file: !36, scope: !14, baseType: !3)
!17 = !DIDerivedType(tag: DW_TAG_member, name: "d", line: 4, size: 64, align: 64, offset: 64, file: !36, scope: !14, baseType: !13)
!18 = !DILocalVariable(name: "argc", line: 22, arg: 1, scope: !19, file: !1, type: !3)
-!19 = distinct !DISubprogram(name: "main", linkageName: "main", line: 22, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, scopeLine: 22, file: !36, scope: !1, type: !20, variables: !35)
+!19 = distinct !DISubprogram(name: "main", linkageName: "main", line: 22, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !2, scopeLine: 22, file: !36, scope: !1, type: !20, variables: !35)
!20 = !DISubroutineType(types: !21)
!21 = !{!3, !3, !22}
!22 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, file: !36, scope: !1, baseType: !23)
@@ -56,7 +56,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!29 = distinct !DILexicalBlock(line: 17, column: 0, file: !36, scope: !9)
!30 = !DILocation(line: 19, scope: !29)
!31 = !{!0}
-!32 = !{!5, !9, !19}
!33 = !{!4}
!34 = !{!8}
!35 = !{!18, !25, !26}
diff --git a/test/CodeGen/X86/2010-05-28-Crash.ll b/test/CodeGen/X86/2010-05-28-Crash.ll
index 7967d45c2ee8..38bbe4e367b1 100644
--- a/test/CodeGen/X86/2010-05-28-Crash.ll
+++ b/test/CodeGen/X86/2010-05-28-Crash.ll
@@ -26,14 +26,14 @@ entry:
!llvm.module.flags = !{!20}
!0 = !DILocalVariable(name: "y", line: 2, arg: 1, scope: !1, file: !2, type: !6)
-!1 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, scopeLine: 2, file: !18, scope: !2, type: !4, variables: !15)
+!1 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !3, scopeLine: 2, file: !18, scope: !2, type: !4, variables: !15)
!2 = !DIFile(filename: "f.c", directory: "/tmp")
-!3 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: 1, file: !18, enums: !19, retainedTypes: !19, subprograms: !17, imports: null)
+!3 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: true, emissionKind: FullDebug, file: !18, enums: !19, retainedTypes: !19, imports: null)
!4 = !DISubroutineType(types: !5)
!5 = !{!6, !6}
!6 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!7 = !DILocalVariable(name: "x", line: 6, arg: 1, scope: !8, file: !2, type: !6)
-!8 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 6, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, scopeLine: 6, file: !18, scope: !2, type: !4, variables: !16)
+!8 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 6, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !3, scopeLine: 6, file: !18, scope: !2, type: !4, variables: !16)
!9 = !DILocation(line: 3, scope: !10)
!10 = distinct !DILexicalBlock(line: 2, column: 0, file: !18, scope: !1)
!11 = !{i32 1}
@@ -42,7 +42,6 @@ entry:
!14 = distinct !DILexicalBlock(line: 6, column: 0, file: !18, scope: !8)
!15 = !{!0}
!16 = !{!7}
-!17 = !{!1, !8}
!18 = !DIFile(filename: "f.c", directory: "/tmp")
!19 = !{}
diff --git a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
index 1be800cdfcf0..fa3932d26698 100644
--- a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
+++ b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
@@ -24,14 +24,14 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.dbg.lv = !{!0, !14, !15, !16, !17, !24, !25, !28}
!0 = !DILocalVariable(name: "this", line: 11, arg: 1, scope: !1, file: !3, type: !12)
-!1 = distinct !DISubprogram(name: "bar", linkageName: "_ZN3foo3barEi", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, scopeLine: 11, file: !31, scope: !2, type: !9)
+!1 = distinct !DISubprogram(name: "bar", linkageName: "_ZN3foo3barEi", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !4, scopeLine: 11, file: !31, scope: !2, type: !9)
!2 = !DICompositeType(tag: DW_TAG_structure_type, name: "foo", line: 3, size: 32, align: 32, file: !31, scope: !3, elements: !5)
!3 = !DIFile(filename: "foo.cp", directory: "/tmp/")
-!4 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "4.2.1 LLVM build", isOptimized: true, emissionKind: 0, file: !31, enums: !32, retainedTypes: !32, subprograms: !33)
+!4 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "4.2.1 LLVM build", isOptimized: true, emissionKind: FullDebug, file: !31, enums: !32, retainedTypes: !32)
!5 = !{!6, !1, !8}
!6 = !DIDerivedType(tag: DW_TAG_member, name: "y", line: 8, size: 32, align: 32, file: !31, scope: !2, baseType: !7)
!7 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!8 = distinct !DISubprogram(name: "baz", linkageName: "_ZN3foo3bazEi", line: 15, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, scopeLine: 15, file: !31, scope: !2, type: !9)
+!8 = distinct !DISubprogram(name: "baz", linkageName: "_ZN3foo3bazEi", line: 15, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !4, scopeLine: 15, file: !31, scope: !2, type: !9)
!9 = !DISubroutineType(types: !10)
!10 = !{!7, !11, !7}
!11 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, flags: DIFlagArtificial, file: !31, scope: !3, baseType: !2)
@@ -41,7 +41,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!15 = !DILocalVariable(name: "this", line: 15, arg: 1, scope: !8, file: !3, type: !12)
!16 = !DILocalVariable(name: "x", line: 15, arg: 2, scope: !8, file: !3, type: !7)
!17 = !DILocalVariable(name: "argc", line: 19, arg: 1, scope: !18, file: !3, type: !7)
-!18 = distinct !DISubprogram(name: "main", linkageName: "main", line: 19, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, scopeLine: 19, file: !31, scope: !3, type: !19)
+!18 = distinct !DISubprogram(name: "main", linkageName: "main", line: 19, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !4, scopeLine: 19, file: !31, scope: !3, type: !19)
!19 = !DISubroutineType(types: !20)
!20 = !{!7, !7, !21}
!21 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, file: !31, scope: !3, baseType: !22)
@@ -56,5 +56,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!30 = distinct !DILexicalBlock(line: 15, column: 0, file: !31, scope: !8)
!31 = !DIFile(filename: "foo.cp", directory: "/tmp/")
!32 = !{}
-!33 = !{!1, !8, !18}
!34 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2010-07-06-DbgCrash.ll b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
index 5e565a1a667f..f86a7601e219 100644
--- a/test/CodeGen/X86/2010-07-06-DbgCrash.ll
+++ b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
@@ -1,14 +1,16 @@
; RUN: llc -O0 -relocation-model pic < %s -o /dev/null
; REQUIRES: default_triple
; PR7545
+
@.str = private constant [4 x i8] c"one\00", align 1 ; <[4 x i8]*> [#uses=1]
@.str1 = private constant [4 x i8] c"two\00", align 1 ; <[5 x i8]*> [#uses=1]
@C.9.2167 = internal constant [2 x i8*] [i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str1, i64 0, i64 0)]
+!llvm.dbg.cu = !{!39}
!38 = !DIFile(filename: "pbmsrch.c", directory: "/Users/grawp/LLVM/test-suite/MultiSource/Benchmarks/MiBench/office-stringsearch")
-!39 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)", isOptimized: true, emissionKind: 0, file: !109, enums: !108, retainedTypes: !108)
+!39 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)", isOptimized: true, emissionKind: FullDebug, file: !109, enums: !108, retainedTypes: !108)
!46 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, file: !109, baseType: !47)
!47 = !DIBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
-!97 = distinct !DISubprogram(name: "main", linkageName: "main", line: 73, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scope: !39, type: !98)
+!97 = distinct !DISubprogram(name: "main", linkageName: "main", line: 73, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !39, scope: !39, type: !98)
!98 = !DISubroutineType(types: !99)
!99 = !{!100}
!100 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
@@ -29,4 +31,3 @@ bb.nph:
}
declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
-
diff --git a/test/CodeGen/X86/2010-08-04-StackVariable.ll b/test/CodeGen/X86/2010-08-04-StackVariable.ll
index d305d678c596..e63a36d7fa36 100644
--- a/test/CodeGen/X86/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/X86/2010-08-04-StackVariable.ll
@@ -81,7 +81,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!0 = !DISubprogram(name: "SVal", line: 11, isLocal: false, isDefinition: false, virtualIndex: 6, isOptimized: false, scopeLine: 11, file: !47, scope: !1, type: !14)
!1 = !DICompositeType(tag: DW_TAG_structure_type, name: "SVal", line: 1, size: 128, align: 64, file: !47, scope: !2, elements: !4)
!2 = !DIFile(filename: "small.cc", directory: "/Users/manav/R8248330")
-!3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: false, emissionKind: 1, file: !47, enums: !48, retainedTypes: !48, subprograms: !46, imports: null)
+!3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", isOptimized: false, emissionKind: FullDebug, file: !47, enums: !48, retainedTypes: !48, imports: null)
!4 = !{!5, !7, !0, !9}
!5 = !DIDerivedType(tag: DW_TAG_member, name: "Data", line: 7, size: 64, align: 64, file: !47, scope: !1, baseType: !6)
!6 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, file: !47, scope: !2, baseType: null)
@@ -94,11 +94,11 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!13 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!14 = !DISubroutineType(types: !15)
!15 = !{null, !12}
-!16 = distinct !DISubprogram(name: "SVal", linkageName: "_ZN4SValC1Ev", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scopeLine: 11, file: !47, scope: !1, type: !14)
-!17 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooi4SVal", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scopeLine: 16, file: !47, scope: !2, type: !18)
+!16 = distinct !DISubprogram(name: "SVal", linkageName: "_ZN4SValC1Ev", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, scopeLine: 11, file: !47, scope: !1, type: !14)
+!17 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooi4SVal", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, scopeLine: 16, file: !47, scope: !2, type: !18)
!18 = !DISubroutineType(types: !19)
!19 = !{!13, !13, !1}
-!20 = distinct !DISubprogram(name: "main", linkageName: "main", line: 23, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scopeLine: 23, file: !47, scope: !2, type: !21)
+!20 = distinct !DISubprogram(name: "main", linkageName: "main", line: 23, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, scopeLine: 23, file: !47, scope: !2, type: !21)
!21 = !DISubroutineType(types: !22)
!22 = !{!13}
!23 = !DILocalVariable(name: "i", line: 16, arg: 1, scope: !17, file: !2, type: !13)
diff --git a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
index 4303ca991a86..0291ce0da468 100644
--- a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
+++ b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
@@ -15,20 +15,19 @@ entry:
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!17}
-!0 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 53, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !14, scope: !1, type: !3)
+!0 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 53, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !2, file: !14, scope: !1, type: !3)
!1 = !DIFile(filename: "", directory: "/private/tmp")
-!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 2.9 (trunk 114084)", isOptimized: false, emissionKind: 0, file: !15, enums: !16, retainedTypes: !16, subprograms: !13)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 2.9 (trunk 114084)", isOptimized: false, emissionKind: FullDebug, file: !15, enums: !16, retainedTypes: !16)
!3 = !DISubroutineType(types: !4)
!4 = !{!5}
!5 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!6 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, file: !15, scope: !7, type: !3)
+!6 = distinct !DISubprogram(name: "bar", linkageName: "bar", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !2, file: !15, scope: !7, type: !3)
!7 = !DIFile(filename: "bug.c", directory: "/private/tmp")
!8 = !DILocation(line: 53, column: 13, scope: !9)
!9 = distinct !DILexicalBlock(line: 53, column: 11, file: !14, scope: !0)
!10 = !DILocation(line: 4, column: 13, scope: !11)
!11 = distinct !DILexicalBlock(line: 4, column: 13, file: !15, scope: !12)
!12 = distinct !DILexicalBlock(line: 4, column: 11, file: !15, scope: !6)
-!13 = !{!0, !6}
!14 = !DIFile(filename: "", directory: "/private/tmp")
!15 = !DIFile(filename: "bug.c", directory: "/private/tmp")
!16 = !{}
diff --git a/test/CodeGen/X86/2010-11-02-DbgParameter.ll b/test/CodeGen/X86/2010-11-02-DbgParameter.ll
index b091003585c2..be2d040a0dcc 100644
--- a/test/CodeGen/X86/2010-11-02-DbgParameter.ll
+++ b/test/CodeGen/X86/2010-11-02-DbgParameter.ll
@@ -18,9 +18,9 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!19}
-!0 = distinct !DISubprogram(name: "foo", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 3, file: !17, scope: !1, type: !3, variables: !16)
+!0 = distinct !DISubprogram(name: "foo", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, scopeLine: 3, file: !17, scope: !1, type: !3, variables: !16)
!1 = !DIFile(filename: "one.c", directory: "/private/tmp")
-!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 2.9 (trunk 117922)", isOptimized: true, emissionKind: 0, file: !17, enums: !18, retainedTypes: !18, subprograms: !15, imports: null)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 2.9 (trunk 117922)", isOptimized: true, emissionKind: FullDebug, file: !17, enums: !18, retainedTypes: !18, imports: null)
!3 = !DISubroutineType(types: !4)
!4 = !{!5}
!5 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
@@ -33,7 +33,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!12 = !DILocation(line: 3, column: 47, scope: !0)
!13 = !DILocation(line: 4, column: 2, scope: !14)
!14 = distinct !DILexicalBlock(line: 3, column: 50, file: !17, scope: !0)
-!15 = !{!0}
!16 = !{!6}
!17 = !DIFile(filename: "one.c", directory: "/private/tmp")
!18 = !{}
diff --git a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
index 661ec94fee4e..d4f4e9057105 100644
--- a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
+++ b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
@@ -78,13 +78,13 @@ declare i32 @puts(i8* nocapture) nounwind
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!33}
-!0 = distinct !DISubprogram(name: "gcd", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, file: !31, scope: !1, type: !3, variables: !29)
+!0 = distinct !DISubprogram(name: "gcd", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, file: !31, scope: !1, type: !3, variables: !29)
!1 = !DIFile(filename: "rem_small.c", directory: "/private/tmp")
-!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 2.9 (trunk 124117)", isOptimized: true, emissionKind: 1, file: !31, enums: !32, retainedTypes: !32, subprograms: !28, imports: null)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 2.9 (trunk 124117)", isOptimized: true, emissionKind: FullDebug, file: !31, enums: !32, retainedTypes: !32, imports: null)
!3 = !DISubroutineType(types: !4)
!4 = !{!5}
!5 = !DIBasicType(tag: DW_TAG_base_type, name: "long int", size: 64, align: 64, encoding: DW_ATE_signed)
-!6 = distinct !DISubprogram(name: "main", line: 25, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, file: !31, scope: !1, type: !7, variables: !30)
+!6 = distinct !DISubprogram(name: "main", line: 25, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !2, file: !31, scope: !1, type: !7, variables: !30)
!7 = !DISubroutineType(types: !8)
!8 = !{!9}
!9 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
@@ -106,7 +106,6 @@ declare i32 @puts(i8* nocapture) nounwind
!25 = !DILocation(line: 27, column: 38, scope: !15)
!26 = !DILocation(line: 28, column: 9, scope: !15)
!27 = !DILocation(line: 30, column: 1, scope: !15)
-!28 = !{!0, !6}
!29 = !{!10, !11, !12}
!30 = !{!14, !17}
!31 = !DIFile(filename: "rem_small.c", directory: "/private/tmp")
diff --git a/test/CodeGen/X86/2011-06-14-PreschedRegalias.ll b/test/CodeGen/X86/2011-06-14-PreschedRegalias.ll
index 114b985f71d4..c9b3df83613d 100644
--- a/test/CodeGen/X86/2011-06-14-PreschedRegalias.ll
+++ b/test/CodeGen/X86/2011-06-14-PreschedRegalias.ll
@@ -6,7 +6,7 @@
define i8 @f(i8 %v1, i8 %v2) nounwind {
entry:
; CHECK: callq
-; CHECK: movb %{{.*}}, %al
+; CHECK: movl %{{.*}}, %eax
; CHECK: mulb
; CHECK: mulb
%rval = tail call i8 @bar() nounwind
diff --git a/test/CodeGen/X86/2011-09-14-valcoalesce.ll b/test/CodeGen/X86/2011-09-14-valcoalesce.ll
index b8e5100c53bb..812628bf0e70 100644
--- a/test/CodeGen/X86/2011-09-14-valcoalesce.ll
+++ b/test/CodeGen/X86/2011-09-14-valcoalesce.ll
@@ -19,7 +19,7 @@
; reusing the pre-addition register later, or the post-addition one. Currently,
; it does the latter, so we check:
-; CHECK: # %while.body85.i
+; CHECK: # %while.body85.i{{$}}
; CHECK-NOT: # %
; CHECK-NOT: add
; CHECK: movl %[[POSTR:e[abcdxi]+]], %[[PRER:e[abcdxi]+]]
diff --git a/test/CodeGen/X86/2011-10-21-widen-cmp.ll b/test/CodeGen/X86/2011-10-21-widen-cmp.ll
index cb4648c382f7..420e843b52a0 100644
--- a/test/CodeGen/X86/2011-10-21-widen-cmp.ll
+++ b/test/CodeGen/X86/2011-10-21-widen-cmp.ll
@@ -42,7 +42,7 @@ entry:
define void @mp_11193(<8 x float> * nocapture %aFOO, <8 x float>* nocapture %RET) nounwind {
; CHECK-LABEL: mp_11193:
; CHECK: # BB#0: # %allocas
-; CHECK-NEXT: movl $-1082130432, (%rsi) # imm = 0xFFFFFFFFBF800000
+; CHECK-NEXT: movl $-1082130432, (%rsi) # imm = 0xBF800000
; CHECK-NEXT: retq
allocas:
%bincmp = fcmp olt <8 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 9.000000e+00, float 1.000000e+00, float 9.000000e+00, float 1.000000e+00> , <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
diff --git a/test/CodeGen/X86/2012-01-11-split-cv.ll b/test/CodeGen/X86/2012-01-11-split-cv.ll
index cb39ed911976..212acedafb94 100644
--- a/test/CodeGen/X86/2012-01-11-split-cv.ll
+++ b/test/CodeGen/X86/2012-01-11-split-cv.ll
@@ -1,12 +1,21 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7-avx -mattr=+avx -mtriple=i686-pc-win32 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mattr=+avx -mtriple=i686-unknown-unknown | FileCheck %s
-;CHECK-LABEL: add18i16:
define void @add18i16(<18 x i16>* nocapture sret %ret, <18 x i16>* %bp) nounwind {
-;CHECK: vmovaps
+; CHECK-LABEL: add18i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: vmovups (%ecx), %ymm0
+; CHECK-NEXT: movl 32(%ecx), %ecx
+; CHECK-NEXT: movl %ecx, 32(%eax)
+; CHECK-NEXT: vmovups %ymm0, (%eax)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl $4
+;
%b = load <18 x i16>, <18 x i16>* %bp, align 16
%x = add <18 x i16> zeroinitializer, %b
store <18 x i16> %x, <18 x i16>* %ret, align 16
-;CHECK: ret
ret void
}
diff --git a/test/CodeGen/X86/2012-01-12-extract-sv.ll b/test/CodeGen/X86/2012-01-12-extract-sv.ll
index 6950641a08ae..9bc4b5f55b64 100644
--- a/test/CodeGen/X86/2012-01-12-extract-sv.ll
+++ b/test/CodeGen/X86/2012-01-12-extract-sv.ll
@@ -3,12 +3,13 @@
define void @endless_loop() {
; CHECK-LABEL: endless_loop:
; CHECK-NEXT: # BB#0:
-; CHECK-NEXT: vbroadcastss (%eax), %ymm0
-; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; CHECK-NEXT: vmovaps (%eax), %ymm0
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; CHECK-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
; CHECK-NEXT: vmovaps %ymm0, (%eax)
; CHECK-NEXT: vmovaps %ymm1, (%eax)
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 4e3f1f4a6e4d..2a76e1a66b2b 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -1,19 +1,31 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
+
; rdar://11314175: SD Scheduler, BuildSchedUnits assert:
; N->getNodeId() == -1 && "Node already inserted!
-; It's hard to test for the ISEL condition because CodeGen optimizes
-; away the bugpointed code. Just ensure the basics are still there.
-;CHECK-LABEL: func:
-;CHECK: vxorps
-;CHECK: vpshufd
-;CHECK: vpbroadcastd
-;CHECK: vinserti128
-;CHECK: vmulps
-;CHECK: vmulps
-;CHECK: ret
-
define void @func() nounwind ssp {
+; CHECK-LABEL: func:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups 0, %xmm0
+; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
+; CHECK-NEXT: vbroadcastss 32, %xmm3
+; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; CHECK-NEXT: vmulps %ymm0, %ymm2, %ymm2
+; CHECK-NEXT: vmulps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vmulps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
+; CHECK-NEXT: vaddps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vsubps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vhaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vmovaps %ymm0, (%rax)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%tmp = load <4 x float>, <4 x float>* null, align 1
%tmp14 = getelementptr <4 x float>, <4 x float>* null, i32 2
%tmp15 = load <4 x float>, <4 x float>* %tmp14, align 1
diff --git a/test/CodeGen/X86/2012-1-10-buildvector.ll b/test/CodeGen/X86/2012-1-10-buildvector.ll
index eb237847e1bc..2d1b5960d98c 100644
--- a/test/CodeGen/X86/2012-1-10-buildvector.ll
+++ b/test/CodeGen/X86/2012-1-10-buildvector.ll
@@ -4,7 +4,7 @@
define void @bad_cast() {
; CHECK-LABEL: bad_cast:
; CHECK: # BB#0:
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vmovaps %xmm0, (%eax)
; CHECK-NEXT: movl $0, (%eax)
; CHECK-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
index 50b486c6f925..495ff0304b1b 100644
--- a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
@@ -38,7 +38,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!12}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.3 (trunk 168918) (llvm/trunk 168920)", isOptimized: true, emissionKind: 0, file: !11, enums: !2, retainedTypes: !2, subprograms: !13, globals: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.3 (trunk 168918) (llvm/trunk 168920)", isOptimized: true, emissionKind: FullDebug, file: !11, enums: !2, retainedTypes: !2, globals: !2)
!2 = !{}
!4 = !DILocalVariable(name: "hg", line: 725, arg: 4, scope: !14, file: !5, type: !6)
!5 = !DIFile(filename: "MultiSource/Benchmarks/Olden/bh/newbh.c", directory: "MultiSource/Benchmarks/Olden/bh")
@@ -46,7 +46,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!7 = !DICompositeType(tag: DW_TAG_structure_type, line: 487, size: 512, align: 64, file: !11)
!11 = !DIFile(filename: "MultiSource/Benchmarks/Olden/bh/newbh.c", directory: "MultiSource/Benchmarks/Olden/bh")
!12 = !{i32 1, !"Debug Info Version", i32 3}
-!13 = !{!14}
-!14 = distinct !DISubprogram(name: "subdivp", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !11, scope: !5, type: !15)
+!14 = distinct !DISubprogram(name: "subdivp", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !11, scope: !5, type: !15)
!15 = !DISubroutineType(types: !16)
!16 = !{null}
diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
index 7ed416e36c22..fbe6000d7ace 100644
--- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
@@ -65,7 +65,7 @@ declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!35}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.3 (trunk 168918) (llvm/trunk 168920)", isOptimized: true, emissionKind: 0, file: !19, enums: !2, retainedTypes: !2, subprograms: !20, globals: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.3 (trunk 168918) (llvm/trunk 168920)", isOptimized: true, emissionKind: FullDebug, file: !19, enums: !2, retainedTypes: !2, globals: !2)
!1 = !{!2}
!2 = !{}
!4 = !DILocalVariable(name: "num1", line: 815, scope: !5, file: !14, type: !15)
@@ -85,8 +85,7 @@ declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...)
!18 = !DISubrange(count: 20)
!19 = !DIFile(filename: "MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", directory: "MultiSource/Benchmarks/MiBench/consumer-typeset")
-!20 = !{!21}
-!21 = distinct !DISubprogram(name: "AttachGalley", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !19, scope: !14, type: !22)
+!21 = distinct !DISubprogram(name: "AttachGalley", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !19, scope: !14, type: !22)
!22 = !DISubroutineType(types: !23)
!23 = !{null}
@@ -134,11 +133,10 @@ declare void @_Znwm()
!llvm.dbg.cu = !{!30}
-!30 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.3 (trunk 169129) (llvm/trunk 169135)", isOptimized: true, emissionKind: 0, file: !34, enums: !2, retainedTypes: !2, subprograms: !36)
+!30 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.3 (trunk 169129) (llvm/trunk 169135)", isOptimized: true, emissionKind: FullDebug, file: !34, enums: !2, retainedTypes: !2)
!31 = !DILocalVariable(name: "X", line: 29, scope: !37, type: !32)
!32 = !DIDerivedType(tag: DW_TAG_typedef, name: "HM", line: 28, file: !34, baseType: null)
!33 = !DIFile(filename: "SingleSource/Benchmarks/Shootout-C++/hash.cpp", directory: "SingleSource/Benchmarks/Shootout-C++")
!34 = !DIFile(filename: "SingleSource/Benchmarks/Shootout-C++/hash.cpp", directory: "SingleSource/Benchmarks/Shootout-C++")
!35 = !{i32 1, !"Debug Info Version", i32 3}
-!36 = !{!37}
-!37 = distinct !DISubprogram(name: "main", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !19, scope: !14, type: !22)
+!37 = distinct !DISubprogram(name: "main", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !30, scopeLine: 1, file: !19, scope: !14, type: !22)
diff --git a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
index 3f7a10ae035b..a717202d3574 100644
--- a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
@@ -36,11 +36,10 @@ invoke.cont44: ; preds = %if.end
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!8}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.3 (trunk 168984) (llvm/trunk 168983)", isOptimized: true, emissionKind: 0, file: !6, subprograms: !1)
-!1 = !{!2}
-!2 = distinct !DISubprogram(name: "test", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 1, file: !6, scope: !5, type: !7)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.3 (trunk 168984) (llvm/trunk 168983)", isOptimized: true, emissionKind: FullDebug, file: !6)
+!2 = distinct !DISubprogram(name: "test", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !6, scope: !5, type: !7)
!3 = !DILocalVariable(name: "callback", line: 214, scope: !2, type: !4)
-!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "btCompoundLeafCallback", line: 90, size: 512, align: 64, file: !6)
+!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "btCompoundLeafCallback", line: 90, size: 64, align: 64, file: !6)
!5 = !DIFile(filename: "MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", directory: "MultiSource/Benchmarks/Bullet")
!6 = !DIFile(filename: "MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", directory: "MultiSource/Benchmarks/Bullet")
!7 = !DISubroutineType(types: !9)
diff --git a/test/CodeGen/X86/3addr-16bit.ll b/test/CodeGen/X86/3addr-16bit.ll
index 2d6a5e76657f..c80e91a4d8b0 100644
--- a/test/CodeGen/X86/3addr-16bit.ll
+++ b/test/CodeGen/X86/3addr-16bit.ll
@@ -12,7 +12,7 @@ entry:
; 64BIT-LABEL: t1:
; 64BIT-NOT: movw %si, %ax
-; 64BIT: leal 1(%rsi), %eax
+; 64BIT: movl %esi, %eax
%0 = icmp eq i16 %k, %c ; <i1> [#uses=1]
%1 = add i16 %k, 1 ; <i16> [#uses=3]
br i1 %0, label %bb, label %bb1
@@ -34,7 +34,7 @@ entry:
; 64BIT-LABEL: t2:
; 64BIT-NOT: movw %si, %ax
-; 64BIT: leal -1(%rsi), %eax
+; 64BIT: movl %esi, %eax
; 64BIT: movzwl %ax
%0 = icmp eq i16 %k, %c ; <i1> [#uses=1]
%1 = add i16 %k, -1 ; <i16> [#uses=3]
@@ -59,7 +59,7 @@ entry:
; 64BIT-LABEL: t3:
; 64BIT-NOT: movw %si, %ax
-; 64BIT: leal 2(%rsi), %eax
+; 64BIT: movl %esi, %eax
%0 = add i16 %k, 2 ; <i16> [#uses=3]
%1 = icmp eq i16 %k, %c ; <i1> [#uses=1]
br i1 %1, label %bb, label %bb1
@@ -82,7 +82,7 @@ entry:
; 64BIT-LABEL: t4:
; 64BIT-NOT: movw %si, %ax
-; 64BIT: leal (%rsi,%rdi), %eax
+; 64BIT: movl %esi, %eax
%0 = add i16 %k, %c ; <i16> [#uses=3]
%1 = icmp eq i16 %k, %c ; <i1> [#uses=1]
br i1 %1, label %bb, label %bb1
diff --git a/test/CodeGen/X86/AppendingLinkage.ll b/test/CodeGen/X86/AppendingLinkage.ll
new file mode 100644
index 000000000000..1a49287d1b38
--- /dev/null
+++ b/test/CodeGen/X86/AppendingLinkage.ll
@@ -0,0 +1,4 @@
+; RUN: not llc < %s -march=x86 2>&1 | FileCheck %s
+
+; CHECK: unknown special variable
+@foo = appending constant [1 x i32 ]zeroinitializer
diff --git a/test/CodeGen/X86/GC/dynamic-frame-size.ll b/test/CodeGen/X86/GC/dynamic-frame-size.ll
index 9ec9b8b08507..0f9a8f57cf2a 100644
--- a/test/CodeGen/X86/GC/dynamic-frame-size.ll
+++ b/test/CodeGen/X86/GC/dynamic-frame-size.ll
@@ -15,7 +15,7 @@ define void @test(i8* %ptr) gc "erlang" {
}
; CHECK: .note.gc
-; CHECK-NEXT: .align 8
+; CHECK-NEXT: .p2align 3
; safe point count
; CHECK: .short 1
; CHECK: .long .Ltmp0
diff --git a/test/CodeGen/X86/GC/erlang-gc.ll b/test/CodeGen/X86/GC/erlang-gc.ll
index c55b7f6dcf61..c2cb8c7d6575 100644
--- a/test/CodeGen/X86/GC/erlang-gc.ll
+++ b/test/CodeGen/X86/GC/erlang-gc.ll
@@ -6,7 +6,7 @@ define i32 @main(i32 %x) nounwind gc "erlang" {
ret i32 0
; CHECK64: .section .note.gc,"",@progbits
-; CHECK64-NEXT: .align 8
+; CHECK64-NEXT: .p2align 3
; CHECK64-NEXT: .short 1 # safe point count
; CHECK64-NEXT: .long .Ltmp0 # safe point address
; CHECK64-NEXT: .short 1 # stack frame size (in words)
@@ -14,7 +14,7 @@ define i32 @main(i32 %x) nounwind gc "erlang" {
; CHECK64-NEXT: .short 0 # live root count
; CHECK32: .section .note.gc,"",@progbits
-; CHECK32-NEXT: .align 4
+; CHECK32-NEXT: .p2align 2
; CHECK32-NEXT: .short 1 # safe point count
; CHECK32-NEXT: .long .Ltmp0 # safe point address
; CHECK32-NEXT: .short 3 # stack frame size (in words)
diff --git a/test/CodeGen/X86/GC/ocaml-gc.ll b/test/CodeGen/X86/GC/ocaml-gc.ll
index 37ddaf90bf67..4e4e2e952f73 100644
--- a/test/CodeGen/X86/GC/ocaml-gc.ll
+++ b/test/CodeGen/X86/GC/ocaml-gc.ll
@@ -22,12 +22,12 @@ define i32 @main(i32 %x) nounwind gc "ocaml" {
; CHECK-NEXT: .globl "caml<stdin>__frametable"
; CHECK-NEXT: "caml<stdin>__frametable":
; CHECK-NEXT: .short 1
-; CHECK-NEXT: .align 8
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: # live roots for main
; CHECK-NEXT: .quad .Ltmp0
; CHECK-NEXT: .short 8
; CHECK-NEXT: .short 0
-; CHECK-NEXT: .align 8
+; CHECK-NEXT: .p2align 3
}
declare i32 @foo(i32)
diff --git a/test/CodeGen/X86/MachineSink-DbgValue.ll b/test/CodeGen/X86/MachineSink-DbgValue.ll
index 457d9beb37d5..a794c896eb27 100644
--- a/test/CodeGen/X86/MachineSink-DbgValue.ll
+++ b/test/CodeGen/X86/MachineSink-DbgValue.ll
@@ -28,17 +28,17 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!22}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)", isOptimized: true, emissionKind: 1, file: !20, enums: !21, retainedTypes: !21, subprograms: !18, imports: null)
-!1 = distinct !DISubprogram(name: "foo", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, file: !20, scope: !2, type: !3, variables: !19)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)", isOptimized: true, emissionKind: FullDebug, file: !20, enums: !21, retainedTypes: !21, imports: null)
+!1 = distinct !DISubprogram(name: "foo", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, file: !20, scope: !2, type: !3, variables: !19)
!2 = !DIFile(filename: "a.c", directory: "/private/tmp")
!3 = !DISubroutineType(types: !4)
!4 = !{!5}
!5 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!6 = !DILocalVariable(name: "i", line: 2, arg: 1, scope: !1, file: !2, type: !5)
!7 = !DILocalVariable(name: "c", line: 2, arg: 2, scope: !1, file: !2, type: !8)
-!8 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, scope: !0, baseType: !9)
+!8 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, scope: !0, baseType: !5)
!9 = !DIBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
-!10 = !DILocalVariable(name: "a", line: 3, scope: !11, file: !2, type: !9)
+!10 = !DILocalVariable(name: "a", line: 3, scope: !11, file: !2, type: !5)
!11 = distinct !DILexicalBlock(line: 2, column: 25, file: !20, scope: !1)
!12 = !DILocation(line: 2, column: 13, scope: !1)
!13 = !DILocation(line: 2, column: 22, scope: !1)
@@ -46,7 +46,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnon
!15 = !DILocation(line: 4, column: 3, scope: !11)
!16 = !DILocation(line: 5, column: 5, scope: !11)
!17 = !DILocation(line: 7, column: 1, scope: !11)
-!18 = !{!1}
!19 = !{!6, !7, !10}
!20 = !DIFile(filename: "a.c", directory: "/private/tmp")
!21 = !{}
diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll
index 70af4184e8a2..b50253bf2b03 100644
--- a/test/CodeGen/X86/MergeConsecutiveStores.ll
+++ b/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -1,5 +1,6 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -addr-sink-using-gep=1 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWON %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=0 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWOFF %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -addr-sink-using-gep=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWON %s
%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
%struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }
@@ -147,7 +148,8 @@ define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) n
; CHECK-LABEL: merge_loads_i16:
; load:
-; CHECK: movw
+; BWON: movzwl
+; BWOFF: movw
; store:
; CHECK: movw
; CHECK: ret
@@ -180,9 +182,11 @@ define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struc
; The loads and the stores are interleaved. Can't merge them.
; CHECK-LABEL: no_merge_loads:
+; BWON: movzbl
+; BWOFF: movb
; CHECK: movb
-; CHECK: movb
-; CHECK: movb
+; BWON: movzbl
+; BWOFF: movb
; CHECK: movb
; CHECK: ret
define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
@@ -337,8 +341,9 @@ block4: ; preds = %4, %.lr.ph
; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy.
; CHECK-LABEL: MergeLoadStoreBaseIndexOffset:
-; CHECK: movw (%{{.*}},%{{.*}}), [[REG:%[a-z]+]]
-; CHECK: movw [[REG]], (%{{.*}})
+; BWON: movzwl (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]]
+; BWOFF: movw (%{{.*}},%{{.*}}), %[[REG:[a-z]+]]
+; CHECK: movw %[[REG]], (%{{.*}})
define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
br label %1
@@ -369,8 +374,9 @@ define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
; word (16 bit) instead of a byte copy even if there are intermediate sign
; extensions.
; CHECK-LABEL: MergeLoadStoreBaseIndexOffsetSext:
-; CHECK: movw (%{{.*}},%{{.*}}), [[REG:%[a-z]+]]
-; CHECK: movw [[REG]], (%{{.*}})
+; BWON: movzwl (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]]
+; BWOFF: movw (%{{.*}},%{{.*}}), %[[REG:[a-z]+]]
+; CHECK: movw %[[REG]], (%{{.*}})
define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {
br label %1
diff --git a/test/CodeGen/X86/StackColoring-dbg.ll b/test/CodeGen/X86/StackColoring-dbg.ll
index 91fe7f819383..15be7aa1029f 100644
--- a/test/CodeGen/X86/StackColoring-dbg.ll
+++ b/test/CodeGen/X86/StackColoring-dbg.ll
@@ -21,15 +21,16 @@ for.body:
br label %for.body
}
+
declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!23}
-!0 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "clang", isOptimized: true, emissionKind: 0, file: !1, enums: !{}, retainedTypes: !{})
+!0 = distinct !DICompileUnit(language: DW_LANG_C89, producer: "clang", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !{}, retainedTypes: !{})
!1 = !DIFile(filename: "t.c", directory: "")
!16 = !DIBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
-!2 = distinct !DISubprogram()
+!2 = distinct !DISubprogram(unit: !0)
!22 = !DILocalVariable(name: "x", line: 16, scope: !2, file: !1, type: !16)
!23 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/StackColoring.ll b/test/CodeGen/X86/StackColoring.ll
index 634f66ad52de..f974cdc30a21 100644
--- a/test/CodeGen/X86/StackColoring.ll
+++ b/test/CodeGen/X86/StackColoring.ll
@@ -1,4 +1,5 @@
; RUN: llc -mcpu=corei7 -no-stack-coloring=false < %s | FileCheck %s --check-prefix=YESCOLOR --check-prefix=CHECK
+; RUN: llc -mcpu=corei7 -no-stack-coloring=false -stackcoloring-lifetime-start-on-first-use=false < %s | FileCheck %s --check-prefix=NOFIRSTUSE --check-prefix=CHECK
; RUN: llc -mcpu=corei7 -no-stack-coloring=true < %s | FileCheck %s --check-prefix=NOCOLOR --check-prefix=CHECK
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -87,7 +88,8 @@ bb3:
}
;CHECK-LABEL: myCall_w4:
-;YESCOLOR: subq $200, %rsp
+;YESCOLOR: subq $120, %rsp
+;NOFIRSTUSE: subq $200, %rsp
;NOCOLOR: subq $408, %rsp
define i32 @myCall_w4(i32 %in) {
@@ -217,7 +219,7 @@ bb3:
;CHECK-LABEL: myCall2_nostart:
-;YESCOLOR: subq $144, %rsp
+;YESCOLOR: subq $272, %rsp
;NOCOLOR: subq $272, %rsp
define i32 @myCall2_nostart(i32 %in, i1 %d) {
entry:
@@ -243,8 +245,8 @@ bb3:
; Adopt the test from Transforms/Inline/array_merge.ll'
;CHECK-LABEL: array_merge:
-;YESCOLOR: subq $816, %rsp
-;NOCOLOR: subq $1616, %rsp
+;YESCOLOR: subq $808, %rsp
+;NOCOLOR: subq $1608, %rsp
define void @array_merge() nounwind ssp {
entry:
%A.i1 = alloca [100 x i32], align 4
@@ -306,6 +308,9 @@ bb3:
;CHECK-LABEL: multi_region_bb:
+;YESCOLOR: subq $272, %rsp
+;NOCOLOR: subq $272, %rsp
+
define void @multi_region_bb() nounwind ssp {
entry:
%A.i1 = alloca [100 x i32], align 4
@@ -330,8 +335,6 @@ entry:
call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
ret void
}
-;YESCOLOR: subq $272, %rsp
-;NOCOLOR: subq $272, %rsp
define i32 @myCall_end_before_begin(i32 %in, i1 %d) {
entry:
@@ -360,7 +363,7 @@ bb3:
; Regression test for PR15707. %buf1 and %buf2 should not be merged
; in this test case.
;CHECK-LABEL: myCall_pr15707:
-;YESCOLOR: subq $200008, %rsp
+;NOFIRSTUSE: subq $200008, %rsp
;NOCOLOR: subq $200008, %rsp
define void @myCall_pr15707() {
%buf1 = alloca i8, i32 100000, align 16
@@ -425,6 +428,164 @@ define i32 @shady_range(i32 %argc, i8** nocapture %argv) uwtable {
ret i32 9
}
+; In this case 'itar1' and 'itar2' can't be overlapped if we treat
+; lifetime.start as the beginning of the lifetime, but we can
+; overlap if we consider first use of the slot as lifetime
+; start. See llvm bug 25776.
+
+;CHECK-LABEL: ifthen_twoslots:
+;YESCOLOR: subq $1544, %rsp
+;NOFIRSTUSE: subq $2056, %rsp
+;NOCOLOR: subq $2568, %rsp
+
+define i32 @ifthen_twoslots(i32 %x) #0 {
+entry:
+ %b1 = alloca [128 x i32], align 16
+ %b2 = alloca [128 x i32], align 16
+ %b3 = alloca [128 x i32], align 16
+ %b4 = alloca [128 x i32], align 16
+ %b5 = alloca [128 x i32], align 16
+ %tmp = bitcast [128 x i32]* %b1 to i8*
+ call void @llvm.lifetime.start(i64 512, i8* %tmp)
+ %tmp1 = bitcast [128 x i32]* %b2 to i8*
+ call void @llvm.lifetime.start(i64 512, i8* %tmp1)
+ %and = and i32 %x, 1
+ %tobool = icmp eq i32 %and, 0
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then: ; preds = %entry
+ %tmp2 = bitcast [128 x i32]* %b3 to i8*
+ call void @llvm.lifetime.start(i64 512, i8* %tmp2)
+ %a1 = getelementptr inbounds [128 x i32], [128 x i32]* %b1, i64 0, i64 0
+ %a2 = getelementptr inbounds [128 x i32], [128 x i32]* %b3, i64 0, i64 0
+ call void @initb(i32* %a1, i32* %a2, i32* null)
+ call void @llvm.lifetime.end(i64 512, i8* %tmp2)
+ br label %if.end
+
+if.else: ; preds = %entry
+ %tmp3 = bitcast [128 x i32]* %b4 to i8*
+ call void @llvm.lifetime.start(i64 512, i8* %tmp3)
+ %tmp4 = bitcast [128 x i32]* %b5 to i8*
+ call void @llvm.lifetime.start(i64 512, i8* %tmp4)
+ %a3 = getelementptr inbounds [128 x i32], [128 x i32]* %b2, i64 0, i64 0
+ %a4 = getelementptr inbounds [128 x i32], [128 x i32]* %b4, i64 0, i64 0
+ %a5 = getelementptr inbounds [128 x i32], [128 x i32]* %b5, i64 0, i64 0
+ call void @initb(i32* %a3, i32* %a4, i32* %a5) #3
+ call void @llvm.lifetime.end(i64 512, i8* %tmp4)
+ call void @llvm.lifetime.end(i64 512, i8* %tmp3)
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ call void @llvm.lifetime.end(i64 512, i8* %tmp1)
+ call void @llvm.lifetime.end(i64 512, i8* %tmp)
+ ret i32 0
+
+}
+
+; This function is intended to test the case where you
+; have a reference to a stack slot that lies outside of
+; the START/END lifetime markers-- the flow analysis
+; should catch this and build the lifetime based on the
+; markers only.
+
+;CHECK-LABEL: while_loop:
+;YESCOLOR: subq $1032, %rsp
+;NOFIRSTUSE: subq $1544, %rsp
+;NOCOLOR: subq $1544, %rsp
+
+define i32 @while_loop(i32 %x) #0 {
+entry:
+ %b1 = alloca [128 x i32], align 16
+ %b2 = alloca [128 x i32], align 16
+ %b3 = alloca [128 x i32], align 16
+ %tmp = bitcast [128 x i32]* %b1 to i8*
+ call void @llvm.lifetime.start(i64 512, i8* %tmp) #3
+ %tmp1 = bitcast [128 x i32]* %b2 to i8*
+ call void @llvm.lifetime.start(i64 512, i8* %tmp1) #3
+ %and = and i32 %x, 1
+ %tobool = icmp eq i32 %and, 0
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then: ; preds = %entry
+ %arraydecay = getelementptr inbounds [128 x i32], [128 x i32]* %b2, i64 0, i64 0
+ call void @inita(i32* %arraydecay) #3
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arraydecay1 = getelementptr inbounds [128 x i32], [128 x i32]* %b1, i64 0, i64 0
+ call void @inita(i32* %arraydecay1) #3
+ %arraydecay3 = getelementptr inbounds [128 x i32], [128 x i32]* %b3, i64 0, i64 0
+ call void @inita(i32* %arraydecay3) #3
+ %tobool25 = icmp eq i32 %x, 0
+ br i1 %tobool25, label %if.end, label %while.body.lr.ph
+
+while.body.lr.ph: ; preds = %if.else
+ %tmp2 = bitcast [128 x i32]* %b3 to i8*
+ br label %while.body
+
+while.body: ; preds = %while.body.lr.ph, %while.body
+ %x.addr.06 = phi i32 [ %x, %while.body.lr.ph ], [ %dec, %while.body ]
+ %dec = add nsw i32 %x.addr.06, -1
+ call void @llvm.lifetime.start(i64 512, i8* %tmp2) #3
+ call void @inita(i32* %arraydecay3) #3
+ call void @llvm.lifetime.end(i64 512, i8* %tmp2) #3
+ %tobool2 = icmp eq i32 %dec, 0
+ br i1 %tobool2, label %if.end.loopexit, label %while.body
+
+if.end.loopexit: ; preds = %while.body
+ br label %if.end
+
+if.end: ; preds = %if.end.loopexit, %if.else, %if.then
+ call void @llvm.lifetime.end(i64 512, i8* %tmp1) #3
+ call void @llvm.lifetime.end(i64 512, i8* %tmp) #3
+ ret i32 0
+}
+
+; Test case motivated by PR27903. Same routine inlined multiple times
+; into a caller results in a multi-segment lifetime, but the second
+; lifetime has no explicit references to the stack slot. Such slots
+; have to be treated conservatively.
+
+;CHECK-LABEL: twobod_b27903:
+;YESCOLOR: subq $96, %rsp
+;NOFIRSTUSE: subq $96, %rsp
+;NOCOLOR: subq $96, %rsp
+
+define i32 @twobod_b27903(i32 %y, i32 %x) {
+entry:
+ %buffer.i = alloca [12 x i32], align 16
+ %abc = alloca [12 x i32], align 16
+ %tmp = bitcast [12 x i32]* %buffer.i to i8*
+ call void @llvm.lifetime.start(i64 48, i8* %tmp)
+ %idxprom.i = sext i32 %y to i64
+ %arrayidx.i = getelementptr inbounds [12 x i32], [12 x i32]* %buffer.i, i64 0, i64 %idxprom.i
+ call void @inita(i32* %arrayidx.i)
+ %add.i = add nsw i32 %x, %y
+ call void @llvm.lifetime.end(i64 48, i8* %tmp)
+ %tobool = icmp eq i32 %y, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %tmp1 = bitcast [12 x i32]* %abc to i8*
+ call void @llvm.lifetime.start(i64 48, i8* %tmp1)
+ %arrayidx = getelementptr inbounds [12 x i32], [12 x i32]* %abc, i64 0, i64 %idxprom.i
+ call void @inita(i32* %arrayidx)
+ call void @llvm.lifetime.start(i64 48, i8* %tmp)
+ call void @inita(i32* %arrayidx.i)
+ %add.i9 = add nsw i32 %add.i, %y
+ call void @llvm.lifetime.end(i64 48, i8* %tmp)
+ call void @llvm.lifetime.end(i64 48, i8* %tmp1)
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %x.addr.0 = phi i32 [ %add.i9, %if.then ], [ %add.i, %entry ]
+ ret i32 %x.addr.0
+}
+
+declare void @inita(i32*)
+
+declare void @initb(i32*,i32*,i32*)
+
declare void @bar([100 x i32]* , [100 x i32]*) nounwind
declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
diff --git a/test/CodeGen/X86/WidenArith.ll b/test/CodeGen/X86/WidenArith.ll
index f87b3821dde8..cdd1a2818b2f 100644
--- a/test/CodeGen/X86/WidenArith.ll
+++ b/test/CodeGen/X86/WidenArith.ll
@@ -1,15 +1,17 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
-;CHECK-LABEL: test:
-;CHECK: vaddps
-;CHECK: vmulps
-;CHECK: vsubps
-;CHECK: vcmpltps
-;CHECK: vcmpltps
-;CHECK: vandps
-;CHECK: vandps
-;CHECK: ret
define <8 x i32> @test(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: test:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm2
+; CHECK-NEXT: vmulps %ymm0, %ymm1, %ymm1
+; CHECK-NEXT: vsubps %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vcmpltps %ymm3, %ymm2, %ymm1
+; CHECK-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%c1 = fadd <8 x float> %a, %b
%b1 = fmul <8 x float> %b, %a
%d = fsub <8 x float> %b1, %c1
diff --git a/test/CodeGen/X86/abi-isel.ll b/test/CodeGen/X86/abi-isel.ll
index f363b64386f5..742041a974b3 100644
--- a/test/CodeGen/X86/abi-isel.ll
+++ b/test/CodeGen/X86/abi-isel.ll
@@ -5,8 +5,8 @@
; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-64-PIC
; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-STATIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-DYNAMIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-PIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin9 -march=x86 -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-DYNAMIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin9 -march=x86 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-PIC
; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-STATIC
; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-DYNAMIC
@@ -8425,25 +8425,25 @@ entry:
; DARWIN-32-DYNAMIC: _lcallee:
; DARWIN-32-DYNAMIC: subl $12, %esp
-; DARWIN-32-DYNAMIC-NEXT: calll L_x$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_x$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_x$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_x$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_x$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_x$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_x$stub
+; DARWIN-32-DYNAMIC-NEXT: calll _x
+; DARWIN-32-DYNAMIC-NEXT: calll _x
+; DARWIN-32-DYNAMIC-NEXT: calll _x
+; DARWIN-32-DYNAMIC-NEXT: calll _x
+; DARWIN-32-DYNAMIC-NEXT: calll _x
+; DARWIN-32-DYNAMIC-NEXT: calll _x
+; DARWIN-32-DYNAMIC-NEXT: calll _x
; DARWIN-32-DYNAMIC-NEXT: addl $12, %esp
; DARWIN-32-DYNAMIC-NEXT: ret
; DARWIN-32-PIC: _lcallee:
; DARWIN-32-PIC: subl $12, %esp
-; DARWIN-32-PIC-NEXT: calll L_x$stub
-; DARWIN-32-PIC-NEXT: calll L_x$stub
-; DARWIN-32-PIC-NEXT: calll L_x$stub
-; DARWIN-32-PIC-NEXT: calll L_x$stub
-; DARWIN-32-PIC-NEXT: calll L_x$stub
-; DARWIN-32-PIC-NEXT: calll L_x$stub
-; DARWIN-32-PIC-NEXT: calll L_x$stub
+; DARWIN-32-PIC-NEXT: calll _x
+; DARWIN-32-PIC-NEXT: calll _x
+; DARWIN-32-PIC-NEXT: calll _x
+; DARWIN-32-PIC-NEXT: calll _x
+; DARWIN-32-PIC-NEXT: calll _x
+; DARWIN-32-PIC-NEXT: calll _x
+; DARWIN-32-PIC-NEXT: calll _x
; DARWIN-32-PIC-NEXT: addl $12, %esp
; DARWIN-32-PIC-NEXT: ret
@@ -8557,25 +8557,25 @@ entry:
; DARWIN-32-DYNAMIC: _dcallee:
; DARWIN-32-DYNAMIC: subl $12, %esp
-; DARWIN-32-DYNAMIC-NEXT: calll L_y$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_y$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_y$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_y$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_y$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_y$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_y$stub
+; DARWIN-32-DYNAMIC-NEXT: calll _y
+; DARWIN-32-DYNAMIC-NEXT: calll _y
+; DARWIN-32-DYNAMIC-NEXT: calll _y
+; DARWIN-32-DYNAMIC-NEXT: calll _y
+; DARWIN-32-DYNAMIC-NEXT: calll _y
+; DARWIN-32-DYNAMIC-NEXT: calll _y
+; DARWIN-32-DYNAMIC-NEXT: calll _y
; DARWIN-32-DYNAMIC-NEXT: addl $12, %esp
; DARWIN-32-DYNAMIC-NEXT: ret
; DARWIN-32-PIC: _dcallee:
; DARWIN-32-PIC: subl $12, %esp
-; DARWIN-32-PIC-NEXT: calll L_y$stub
-; DARWIN-32-PIC-NEXT: calll L_y$stub
-; DARWIN-32-PIC-NEXT: calll L_y$stub
-; DARWIN-32-PIC-NEXT: calll L_y$stub
-; DARWIN-32-PIC-NEXT: calll L_y$stub
-; DARWIN-32-PIC-NEXT: calll L_y$stub
-; DARWIN-32-PIC-NEXT: calll L_y$stub
+; DARWIN-32-PIC-NEXT: calll _y
+; DARWIN-32-PIC-NEXT: calll _y
+; DARWIN-32-PIC-NEXT: calll _y
+; DARWIN-32-PIC-NEXT: calll _y
+; DARWIN-32-PIC-NEXT: calll _y
+; DARWIN-32-PIC-NEXT: calll _y
+; DARWIN-32-PIC-NEXT: calll _y
; DARWIN-32-PIC-NEXT: addl $12, %esp
; DARWIN-32-PIC-NEXT: ret
@@ -8802,15 +8802,15 @@ entry:
; DARWIN-32-DYNAMIC: _caller:
; DARWIN-32-DYNAMIC: subl $12, %esp
-; DARWIN-32-DYNAMIC-NEXT: calll L_callee$stub
-; DARWIN-32-DYNAMIC-NEXT: calll L_callee$stub
+; DARWIN-32-DYNAMIC-NEXT: calll _callee
+; DARWIN-32-DYNAMIC-NEXT: calll _callee
; DARWIN-32-DYNAMIC-NEXT: addl $12, %esp
; DARWIN-32-DYNAMIC-NEXT: ret
; DARWIN-32-PIC: _caller:
; DARWIN-32-PIC: subl $12, %esp
-; DARWIN-32-PIC-NEXT: calll L_callee$stub
-; DARWIN-32-PIC-NEXT: calll L_callee$stub
+; DARWIN-32-PIC-NEXT: calll _callee
+; DARWIN-32-PIC-NEXT: calll _callee
; DARWIN-32-PIC-NEXT: addl $12, %esp
; DARWIN-32-PIC-NEXT: ret
@@ -9021,13 +9021,13 @@ entry:
; DARWIN-32-DYNAMIC: _tailcaller:
; DARWIN-32-DYNAMIC: subl $12, %esp
-; DARWIN-32-DYNAMIC-NEXT: calll L_callee$stub
+; DARWIN-32-DYNAMIC-NEXT: calll _callee
; DARWIN-32-DYNAMIC-NEXT: addl $12, %esp
; DARWIN-32-DYNAMIC-NEXT: ret
; DARWIN-32-PIC: _tailcaller:
; DARWIN-32-PIC: subl $12, %esp
-; DARWIN-32-PIC-NEXT: calll L_callee$stub
+; DARWIN-32-PIC-NEXT: calll _callee
; DARWIN-32-PIC-NEXT: addl $12, %esp
; DARWIN-32-PIC-NEXT: ret
diff --git a/test/CodeGen/X86/add-nsw-sext.ll b/test/CodeGen/X86/add-nsw-sext.ll
index 0a6f6c315c13..658c58b3d61b 100644
--- a/test/CodeGen/X86/add-nsw-sext.ll
+++ b/test/CodeGen/X86/add-nsw-sext.ll
@@ -25,7 +25,7 @@ define i64 @add_nsw_sext_add(i32 %i, i64 %x) {
; CHECK-LABEL: add_nsw_sext_add:
; CHECK: # BB#0:
; CHECK-NEXT: movslq %edi, %rax
-; CHECK-NEXT: leaq 5(%rax,%rsi), %rax
+; CHECK-NEXT: leaq 5(%rsi,%rax), %rax
; CHECK-NEXT: retq
%add = add nsw i32 %i, 5
@@ -72,7 +72,7 @@ define i8* @gep8(i32 %i, i8* %x) {
; CHECK-LABEL: gep8:
; CHECK: # BB#0:
; CHECK-NEXT: movslq %edi, %rax
-; CHECK-NEXT: leaq 5(%rax,%rsi), %rax
+; CHECK-NEXT: leaq 5(%rsi,%rax), %rax
; CHECK-NEXT: retq
%add = add nsw i32 %i, 5
@@ -127,7 +127,7 @@ define i128* @gep128(i32 %i, i128* %x) {
; CHECK: # BB#0:
; CHECK-NEXT: movslq %edi, %rax
; CHECK-NEXT: shlq $4, %rax
-; CHECK-NEXT: leaq 80(%rax,%rsi), %rax
+; CHECK-NEXT: leaq 80(%rsi,%rax), %rax
; CHECK-NEXT: retq
%add = add nsw i32 %i, 5
diff --git a/test/CodeGen/X86/add.ll b/test/CodeGen/X86/add.ll
index 62a62a460bd7..df1bc9b6ee7e 100644
--- a/test/CodeGen/X86/add.ll
+++ b/test/CodeGen/X86/add.ll
@@ -148,3 +148,39 @@ entry:
; X64: incl
; X64-NEXT: seto
}
+
+define void @test11(i32* inreg %a) nounwind {
+ %aa = load i32, i32* %a
+ %b = add i32 %aa, 128
+ store i32 %b, i32* %a
+ ret void
+; X32-LABEL: test11:
+; X32: subl $-128, (%
+; X64-LABEL: test11:
+; X64: subl $-128, (%
+}
+
+define void @test12(i64* inreg %a) nounwind {
+ %aa = load i64, i64* %a
+ %b = add i64 %aa, 2147483648
+ store i64 %b, i64* %a
+ ret void
+; X32-LABEL: test12:
+; X32: addl (%
+; X32-NEXT: adcl $0,
+; X64-LABEL: test12:
+; X64: subq $-2147483648, (%
+}
+
+define void @test13(i64* inreg %a) nounwind {
+ %aa = load i64, i64* %a
+ %b = add i64 %aa, 128
+ store i64 %b, i64* %a
+ ret void
+
+; X32-LABEL: test13:
+; X32: addl (%
+; X32-NEXT: adcl $0,
+; X64-LABEL: test13:
+; X64: subq $-128, (%
+}
diff --git a/test/CodeGen/X86/alias-gep.ll b/test/CodeGen/X86/alias-gep.ll
new file mode 100644
index 000000000000..5ecf20ba78ed
--- /dev/null
+++ b/test/CodeGen/X86/alias-gep.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=MACHO %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck --check-prefix=ELF %s
+
+;MACHO: .globl _offsetSym0
+;MACHO-NOT: .alt_entry
+;MACHO: _offsetSym0 = _s
+;MACHO: .globl _offsetSym1
+;MACHO: .alt_entry _offsetSym1
+;MACHO: _offsetSym1 = _s+8
+
+;ELF: .globl offsetSym0
+;ELF-NOT: .alt_entry
+;ELF: offsetSym0 = s
+;ELF: .globl offsetSym1
+;ELF-NOT: .alt_entry
+;ELF: offsetSym1 = s+8
+
+%struct.S1 = type { i32, i32, i32 }
+
+@s = global %struct.S1 { i32 31, i32 32, i32 33 }, align 4
+@offsetSym0 = alias i32, i32* getelementptr inbounds (%struct.S1, %struct.S1* @s, i64 0, i32 0)
+@offsetSym1 = alias i32, i32* getelementptr inbounds (%struct.S1, %struct.S1* @s, i64 0, i32 2)
diff --git a/test/CodeGen/X86/aligned-variadic.ll b/test/CodeGen/X86/aligned-variadic.ll
index 294159220626..1ea57296a707 100644
--- a/test/CodeGen/X86/aligned-variadic.ll
+++ b/test/CodeGen/X86/aligned-variadic.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -stack-symbol-ordering=0 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin -stack-symbol-ordering=0 | FileCheck %s -check-prefix=X32
%struct.Baz = type { [17 x i8] }
%struct.__va_list_tag = type { i32, i32, i8*, i8* }
diff --git a/test/CodeGen/X86/alignment.ll b/test/CodeGen/X86/alignment.ll
index 5908c0cde61e..acf11fdec494 100644
--- a/test/CodeGen/X86/alignment.ll
+++ b/test/CodeGen/X86/alignment.ll
@@ -6,7 +6,7 @@
; CHECK: .bss
; CHECK: .globl GlobalA
-; CHECK: .align 8
+; CHECK: .p2align 3
; CHECK: GlobalA:
; CHECK: .zero 384
@@ -29,7 +29,7 @@
@GlobalAS = global { [384 x i8] } zeroinitializer, align 8, section "foo"
; CHECK: .globl GlobalAS
-; CHECK: .align 8
+; CHECK: .p2align 3
; CHECK: GlobalAS:
; CHECK: .zero 384
diff --git a/test/CodeGen/X86/all-ones-vector.ll b/test/CodeGen/X86/all-ones-vector.ll
index 10fecadaa023..9707eb57ae47 100644
--- a/test/CodeGen/X86/all-ones-vector.ll
+++ b/test/CodeGen/X86/all-ones-vector.ll
@@ -1,14 +1,143 @@
-; RUN: llc < %s -march=x86 -mattr=sse2 | grep pcmpeqd | count 4
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-SSE
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+avx | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX
+
+define <16 x i8> @coo() nounwind {
+; X32-SSE-LABEL: coo:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: coo:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: coo:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: coo:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ ret <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+}
+
+define <8 x i16> @soo() nounwind {
+; X32-SSE-LABEL: soo:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: soo:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: soo:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: soo:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ ret <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+}
define <4 x i32> @ioo() nounwind {
- ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+; X32-SSE-LABEL: ioo:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: ioo:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: ioo:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: ioo:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
}
+
define <2 x i64> @loo() nounwind {
- ret <2 x i64> <i64 -1, i64 -1>
+; X32-SSE-LABEL: loo:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: loo:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: loo:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: loo:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ ret <2 x i64> <i64 -1, i64 -1>
}
+
define <2 x double> @doo() nounwind {
- ret <2 x double> <double 0xffffffffffffffff, double 0xffffffffffffffff>
+; X32-SSE-LABEL: doo:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: doo:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: doo:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: doo:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ ret <2 x double> <double 0xffffffffffffffff, double 0xffffffffffffffff>
}
+
define <4 x float> @foo() nounwind {
- ret <4 x float> <float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000>
+; X32-SSE-LABEL: foo:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: foo:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: foo:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: foo:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ ret <4 x float> <float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000>
}
diff --git a/test/CodeGen/X86/and-encoding.ll b/test/CodeGen/X86/and-encoding.ll
index f7bbac2a4bd9..1a90bd0d6eb7 100644
--- a/test/CodeGen/X86/and-encoding.ll
+++ b/test/CodeGen/X86/and-encoding.ll
@@ -15,27 +15,18 @@ define void @f1() {
ret void
}
-define void @f2(i1 *%x, i16 *%y) {
+define void @f2(i16 %x, i1 *%y) {
; CHECK-LABEL: f2:
-; CHECK: andl $1, %eax # encoding: [0x83,0xe0,0x01]
- %a = load i1, i1* %x
- %b = zext i1 %a to i16
- store i16 %b, i16* %y
+; CHECK: andl $1, %edi # encoding: [0x83,0xe7,0x01]
+ %c = trunc i16 %x to i1
+ store i1 %c, i1* %y
ret void
}
-define i32 @f3(i1 *%x) {
+define void @f3(i32 %x, i1 *%y) {
; CHECK-LABEL: f3:
-; CHECK: andl $1, %eax # encoding: [0x83,0xe0,0x01]
- %a = load i1, i1* %x
- %b = zext i1 %a to i32
- ret i32 %b
-}
-
-define i64 @f4(i1 *%x) {
-; CHECK-LABEL: f4:
-; CHECK: andl $1, %eax # encoding: [0x83,0xe0,0x01]
- %a = load i1, i1* %x
- %b = zext i1 %a to i64
- ret i64 %b
+; CHECK: andl $1, %edi # encoding: [0x83,0xe7,0x01]
+ %c = trunc i32 %x to i1
+ store i1 %c, i1* %y
+ ret void
}
diff --git a/test/CodeGen/X86/anyext.ll b/test/CodeGen/X86/anyext.ll
index 106fe83661b4..4f4218bdd63d 100644
--- a/test/CodeGen/X86/anyext.ll
+++ b/test/CodeGen/X86/anyext.ll
@@ -1,15 +1,52 @@
-; RUN: llc < %s -march=x86-64 | grep movzbl | count 2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
; Use movzbl to avoid partial-register updates.
define i32 @foo(i32 %p, i8 zeroext %x) nounwind {
+; X32-LABEL: foo:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; X32-NEXT: divb {{[0-9]+}}(%esp)
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: foo:
+; X64: # BB#0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; X64-NEXT: divb %sil
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
%q = trunc i32 %p to i8
%r = udiv i8 %q, %x
%s = zext i8 %r to i32
%t = and i32 %s, 1
ret i32 %t
}
+
define i32 @bar(i32 %p, i16 zeroext %x) nounwind {
+; X32-LABEL: bar:
+; X32: # BB#0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: divw {{[0-9]+}}(%esp)
+; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<def>
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: bar:
+; X64: # BB#0:
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: divw %si
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<def>
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
%q = trunc i32 %p to i16
%r = udiv i16 %q, %x
%s = zext i16 %r to i32
diff --git a/test/CodeGen/X86/atom-lea-sp.ll b/test/CodeGen/X86/atom-lea-sp.ll
index 1ee3b00ee87e..25da6b30adfe 100644
--- a/test/CodeGen/X86/atom-lea-sp.ll
+++ b/test/CodeGen/X86/atom-lea-sp.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck -check-prefix=ATOM %s
-; RUN: llc < %s -mcpu=core2 -mtriple=i686-linux | FileCheck %s
+; RUN: llc < %s -mcpu=atom -mtriple=i686-linux -no-x86-call-frame-opt | FileCheck -check-prefix=ATOM %s
+; RUN: llc < %s -mcpu=core2 -mtriple=i686-linux -no-x86-call-frame-opt | FileCheck %s
declare void @use_arr(i8*)
declare void @many_params(i32, i32, i32, i32, i32, i32)
diff --git a/test/CodeGen/X86/atomic-eflags-reuse.ll b/test/CodeGen/X86/atomic-eflags-reuse.ll
new file mode 100644
index 000000000000..dc1814b55cd3
--- /dev/null
+++ b/test/CodeGen/X86/atomic-eflags-reuse.ll
@@ -0,0 +1,179 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+
+define i32 @test_add_1_cmov_slt(i64* %p, i32 %a0, i32 %a1) #0 {
+; CHECK-LABEL: test_add_1_cmov_slt:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lock incq (%rdi)
+; CHECK-NEXT: cmovgl %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
+ %tmp1 = icmp slt i64 %tmp0, 0
+ %tmp2 = select i1 %tmp1, i32 %a0, i32 %a1
+ ret i32 %tmp2
+}
+
+define i32 @test_add_1_cmov_sge(i64* %p, i32 %a0, i32 %a1) #0 {
+; CHECK-LABEL: test_add_1_cmov_sge:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lock incq (%rdi)
+; CHECK-NEXT: cmovlel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
+ %tmp1 = icmp sge i64 %tmp0, 0
+ %tmp2 = select i1 %tmp1, i32 %a0, i32 %a1
+ ret i32 %tmp2
+}
+
+define i32 @test_sub_1_cmov_sle(i64* %p, i32 %a0, i32 %a1) #0 {
+; CHECK-LABEL: test_sub_1_cmov_sle:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lock decq (%rdi)
+; CHECK-NEXT: cmovgel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
+ %tmp1 = icmp sle i64 %tmp0, 0
+ %tmp2 = select i1 %tmp1, i32 %a0, i32 %a1
+ ret i32 %tmp2
+}
+
+define i32 @test_sub_1_cmov_sgt(i64* %p, i32 %a0, i32 %a1) #0 {
+; CHECK-LABEL: test_sub_1_cmov_sgt:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lock decq (%rdi)
+; CHECK-NEXT: cmovll %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
+ %tmp1 = icmp sgt i64 %tmp0, 0
+ %tmp2 = select i1 %tmp1, i32 %a0, i32 %a1
+ ret i32 %tmp2
+}
+
+; FIXME: (setcc slt x, 0) gets combined into shr early.
+define i8 @test_add_1_setcc_slt(i64* %p) #0 {
+; CHECK-LABEL: test_add_1_setcc_slt:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: lock xaddq %rax, (%rdi)
+; CHECK-NEXT: shrq $63, %rax
+; CHECK-NEXT: # kill: %AL<def> %AL<kill> %RAX<kill>
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
+ %tmp1 = icmp slt i64 %tmp0, 0
+ %tmp2 = zext i1 %tmp1 to i8
+ ret i8 %tmp2
+}
+
+define i8 @test_sub_1_setcc_sgt(i64* %p) #0 {
+; CHECK-LABEL: test_sub_1_setcc_sgt:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lock decq (%rdi)
+; CHECK-NEXT: setge %al
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
+ %tmp1 = icmp sgt i64 %tmp0, 0
+ %tmp2 = zext i1 %tmp1 to i8
+ ret i8 %tmp2
+}
+
+define i32 @test_add_1_brcond_sge(i64* %p, i32 %a0, i32 %a1) #0 {
+; CHECK-LABEL: test_add_1_brcond_sge:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: lock incq (%rdi)
+; CHECK-NEXT: jle .LBB6_2
+; CHECK-NEXT: # BB#1: # %t
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB6_2: # %f
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
+ %tmp1 = icmp sge i64 %tmp0, 0
+ br i1 %tmp1, label %t, label %f
+t:
+ ret i32 %a0
+f:
+ ret i32 %a1
+}
+
+; Also make sure we don't muck with condition codes that we should ignore.
+; No need to test unsigned comparisons, as they should all be simplified.
+
+define i32 @test_add_1_cmov_sle(i64* %p, i32 %a0, i32 %a1) #0 {
+; CHECK-LABEL: test_add_1_cmov_sle:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: lock xaddq %rax, (%rdi)
+; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: cmovgl %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
+ %tmp1 = icmp sle i64 %tmp0, 0
+ %tmp2 = select i1 %tmp1, i32 %a0, i32 %a1
+ ret i32 %tmp2
+}
+
+define i32 @test_add_1_cmov_sgt(i64* %p, i32 %a0, i32 %a1) #0 {
+; CHECK-LABEL: test_add_1_cmov_sgt:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: lock xaddq %rax, (%rdi)
+; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: cmovlel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
+ %tmp1 = icmp sgt i64 %tmp0, 0
+ %tmp2 = select i1 %tmp1, i32 %a0, i32 %a1
+ ret i32 %tmp2
+}
+
+; Test a result being used by more than just the comparison.
+
+define i8 @test_add_1_setcc_sgt_reuse(i64* %p, i64* %p2) #0 {
+; CHECK-LABEL: test_add_1_setcc_sgt_reuse:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl $1, %ecx
+; CHECK-NEXT: lock xaddq %rcx, (%rdi)
+; CHECK-NEXT: testq %rcx, %rcx
+; CHECK-NEXT: setg %al
+; CHECK-NEXT: movq %rcx, (%rsi)
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
+ %tmp1 = icmp sgt i64 %tmp0, 0
+ %tmp2 = zext i1 %tmp1 to i8
+ store i64 %tmp0, i64* %p2
+ ret i8 %tmp2
+}
+
+define i8 @test_sub_2_setcc_sgt(i64* %p) #0 {
+; CHECK-LABEL: test_sub_2_setcc_sgt:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq $-2, %rax
+; CHECK-NEXT: lock xaddq %rax, (%rdi)
+; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: setg %al
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw sub i64* %p, i64 2 seq_cst
+ %tmp1 = icmp sgt i64 %tmp0, 0
+ %tmp2 = zext i1 %tmp1 to i8
+ ret i8 %tmp2
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/atomic-non-integer.ll b/test/CodeGen/X86/atomic-non-integer.ll
index 98fcd96d3e4c..17b73ecf4e1c 100644
--- a/test/CodeGen/X86/atomic-non-integer.ll
+++ b/test/CodeGen/X86/atomic-non-integer.ll
@@ -43,7 +43,7 @@ define half @load_half(half* %fptr) {
; CHECK-LABEL: @load_half
; CHECK: movw (%rdi), %ax
; CHECK: movzwl %ax, %edi
-; CHECK: jmp __gnu_h2f_ieee
+; CHECK: callq __gnu_h2f_ieee
%v = load atomic half, half* %fptr unordered, align 2
ret half %v
}
diff --git a/test/CodeGen/X86/atomic128.ll b/test/CodeGen/X86/atomic128.ll
index c41269b0b606..1bf7bfbfa260 100644
--- a/test/CodeGen/X86/atomic128.ll
+++ b/test/CodeGen/X86/atomic128.ll
@@ -4,9 +4,14 @@
define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
; CHECK-LABEL: val_compare_and_swap:
+; Due to the scheduling right after isel for cmpxchg, and given that the
+; machine scheduler and copy coalescer do not mess up the physical
+; register live-ranges, we end up with a useless copy.
+;
+; CHECK: movq %rcx, [[TMP:%r[0-9a-z]+]]
; CHECK: movq %rsi, %rax
-; CHECK: movq %rcx, %rbx
; CHECK: movq %r8, %rcx
+; CHECK: movq [[TMP]], %rbx
; CHECK: lock
; CHECK: cmpxchg16b (%rdi)
@@ -216,8 +221,8 @@ define i128 @atomic_load_seq_cst(i128* %p) {
; CHECK-LABEL: atomic_load_seq_cst:
; CHECK: xorl %eax, %eax
; CHECK: xorl %edx, %edx
-; CHECK: xorl %ebx, %ebx
; CHECK: xorl %ecx, %ecx
+; CHECK: xorl %ebx, %ebx
; CHECK: lock
; CHECK: cmpxchg16b (%rdi)
@@ -229,8 +234,8 @@ define i128 @atomic_load_relaxed(i128* %p) {
; CHECK: atomic_load_relaxed:
; CHECK: xorl %eax, %eax
; CHECK: xorl %edx, %edx
-; CHECK: xorl %ebx, %ebx
; CHECK: xorl %ecx, %ecx
+; CHECK: xorl %ebx, %ebx
; CHECK: lock
; CHECK: cmpxchg16b (%rdi)
diff --git a/test/CodeGen/X86/atomic16.ll b/test/CodeGen/X86/atomic16.ll
index f6892de43d89..90716cc3984f 100644
--- a/test/CodeGen/X86/atomic16.ll
+++ b/test/CodeGen/X86/atomic16.ll
@@ -154,17 +154,19 @@ define void @atomic_fetch_nand16(i16 %x) nounwind {
}
define void @atomic_fetch_max16(i16 %x) nounwind {
+; X64-LABEL: atomic_fetch_max16
+; X32-LABEL: atomic_fetch_max16
%t1 = atomicrmw max i16* @sc16, i16 %x acquire
-; X64: movswl
-; X64: movswl
-; X64: subl
+; X64: movw
+; X64: movw
+; X64: subw
; X64: cmov
; X64: lock
; X64: cmpxchgw
-; X32: movswl
-; X32: movswl
-; X32: subl
+; X32: movw
+; X32: movw
+; X32: subw
; X32: cmov
; X32: lock
; X32: cmpxchgw
@@ -174,17 +176,19 @@ define void @atomic_fetch_max16(i16 %x) nounwind {
}
define void @atomic_fetch_min16(i16 %x) nounwind {
+; X64-LABEL: atomic_fetch_min16
+; X32-LABEL: atomic_fetch_min16
%t1 = atomicrmw min i16* @sc16, i16 %x acquire
-; X64: movswl
-; X64: movswl
-; X64: subl
+; X64: movw
+; X64: movw
+; X64: subw
; X64: cmov
; X64: lock
; X64: cmpxchgw
-; X32: movswl
-; X32: movswl
-; X32: subl
+; X32: movw
+; X32: movw
+; X32: subw
; X32: cmov
; X32: lock
; X32: cmpxchgw
@@ -194,17 +198,19 @@ define void @atomic_fetch_min16(i16 %x) nounwind {
}
define void @atomic_fetch_umax16(i16 %x) nounwind {
+; X64-LABEL: atomic_fetch_umax16
+; X32-LABEL: atomic_fetch_umax16
%t1 = atomicrmw umax i16* @sc16, i16 %x acquire
-; X64: movzwl
-; X64: movzwl
-; X64: subl
+; X64: movw
+; X64: movw
+; X64: subw
; X64: cmov
; X64: lock
; X64: cmpxchgw
-; X32: movzwl
-; X32: movzwl
-; X32: subl
+; X32: movw
+; X32: movw
+; X32: subw
; X32: cmov
; X32: lock
; X32: cmpxchgw
@@ -214,17 +220,19 @@ define void @atomic_fetch_umax16(i16 %x) nounwind {
}
define void @atomic_fetch_umin16(i16 %x) nounwind {
+; X64-LABEL: atomic_fetch_umin16
+; X32-LABEL: atomic_fetch_umin16
%t1 = atomicrmw umin i16* @sc16, i16 %x acquire
-; X64: movzwl
-; X64: movzwl
-; X64: subl
+; X64: movw
+; X64: movw
+; X64: subw
; X64: cmov
; X64: lock
; X64: cmpxchgw
-; X32: movzwl
-; X32: movzwl
-; X32: subl
+; X32: movw
+; X32: movw
+; X32: subw
; X32: cmov
; X32: lock
; X32: cmpxchgw
diff --git a/test/CodeGen/X86/atomic8.ll b/test/CodeGen/X86/atomic8.ll
index 5eef9b295e80..01123ae9b073 100644
--- a/test/CodeGen/X86/atomic8.ll
+++ b/test/CodeGen/X86/atomic8.ll
@@ -157,15 +157,15 @@ define void @atomic_fetch_max8(i8 %x) nounwind {
; X64-LABEL: atomic_fetch_max8:
; X32-LABEL: atomic_fetch_max8:
%t1 = atomicrmw max i8* @sc8, i8 %x acquire
-; X64: movsbl
-; X64: movsbl
-; X64: subl
+; X64: movb
+; X64: movb
+; X64: subb
; X64: lock
; X64: cmpxchgb
-; X32: movsbl
-; X32: movsbl
-; X32: subl
+; X32: movb
+; X32: movb
+; X32: subb
; X32: lock
; X32: cmpxchgb
ret void
@@ -177,15 +177,15 @@ define void @atomic_fetch_min8(i8 %x) nounwind {
; X64-LABEL: atomic_fetch_min8:
; X32-LABEL: atomic_fetch_min8:
%t1 = atomicrmw min i8* @sc8, i8 %x acquire
-; X64: movsbl
-; X64: movsbl
-; X64: subl
+; X64: movb
+; X64: movb
+; X64: subb
; X64: lock
; X64: cmpxchgb
-; X32: movsbl
-; X32: movsbl
-; X32: subl
+; X32: movb
+; X32: movb
+; X32: subb
; X32: lock
; X32: cmpxchgb
ret void
@@ -197,15 +197,15 @@ define void @atomic_fetch_umax8(i8 %x) nounwind {
; X64-LABEL: atomic_fetch_umax8:
; X32-LABEL: atomic_fetch_umax8:
%t1 = atomicrmw umax i8* @sc8, i8 %x acquire
-; X64: movzbl
-; X64: movzbl
-; X64: subl
+; X64: movb
+; X64: movb
+; X64: subb
; X64: lock
; X64: cmpxchgb
-; X32: movzbl
-; X32: movzbl
-; X32: subl
+; X32: movb
+; X32: movb
+; X32: subb
; X32: lock
; X32: cmpxchgb
ret void
@@ -217,15 +217,15 @@ define void @atomic_fetch_umin8(i8 %x) nounwind {
; X64-LABEL: atomic_fetch_umin8:
; X32-LABEL: atomic_fetch_umin8:
%t1 = atomicrmw umin i8* @sc8, i8 %x acquire
-; X64: movzbl
-; X64: movzbl
-; X64: subl
+; X64: movb
+; X64: movb
+; X64: subb
; X64: lock
; X64: cmpxchgb
-; X32: movzbl
-; X32: movzbl
-; X32: subl
+; X32: movb
+; X32: movb
+; X32: subb
; X32: lock
; X32: cmpxchgb
ret void
diff --git a/test/CodeGen/X86/atomic_mi.ll b/test/CodeGen/X86/atomic_mi.ll
index 356d9dcff6fa..e9f1b59ac589 100644
--- a/test/CodeGen/X86/atomic_mi.ll
+++ b/test/CodeGen/X86/atomic_mi.ll
@@ -979,3 +979,20 @@ define void @fadd_64stack() {
store atomic i64 %bc1, i64* %ptr release, align 8
ret void
}
+
+define void @fadd_array(i64* %arg, double %arg1, i64 %arg2) {
+; X64-LABEL: fadd_array:
+; X64-NOT: lock
+; X64: addsd ([[ADDR:%r..,%r..,8]]), %[[XMM:xmm[0-9]+]]
+; X64-NEXT: movsd %[[XMM]], ([[ADDR]])
+; X32-LABEL: fadd_array:
+; Don't check x86-32 (see comment above).
+bb:
+ %tmp4 = getelementptr inbounds i64, i64* %arg, i64 %arg2
+ %tmp6 = load atomic i64, i64* %tmp4 monotonic, align 8
+ %tmp7 = bitcast i64 %tmp6 to double
+ %tmp8 = fadd double %tmp7, %arg1
+ %tmp9 = bitcast double %tmp8 to i64
+ store atomic i64 %tmp9, i64* %tmp4 monotonic, align 8
+ ret void
+}
diff --git a/test/CodeGen/X86/avoid-loop-align.ll b/test/CodeGen/X86/avoid-loop-align.ll
index d82cf9418e64..9895b30800ec 100644
--- a/test/CodeGen/X86/avoid-loop-align.ll
+++ b/test/CodeGen/X86/avoid-loop-align.ll
@@ -4,7 +4,7 @@
; header in this case.
; CHECK: jmp LBB0_2
-; CHECK: .align
+; CHECK: .p2align
; CHECK: LBB0_1:
@A = common global [100 x i32] zeroinitializer, align 32 ; <[100 x i32]*> [#uses=1]
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index 6857bb8bd112..b05dc71c175a 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -1,30 +1,44 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx | FileCheck %s
@x = common global <8 x float> zeroinitializer, align 32
@y = common global <4 x double> zeroinitializer, align 32
@z = common global <4 x float> zeroinitializer, align 16
define void @zero128() nounwind ssp {
-entry:
- ; CHECK: vxorps
- ; CHECK: vmovaps
+; CHECK-LABEL: zero128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: movq _z@{{.*}}(%rip), %rax
+; CHECK-NEXT: vmovaps %xmm0, (%rax)
+; CHECK-NEXT: retq
store <4 x float> zeroinitializer, <4 x float>* @z, align 16
ret void
}
define void @zero256() nounwind ssp {
-entry:
- ; CHECK: vxorps
- ; CHECK: vmovaps
- ; CHECK: vmovaps
+; CHECK-LABEL: zero256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movq _x@{{.*}}(%rip), %rax
+; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vmovaps %ymm0, (%rax)
+; CHECK-NEXT: movq _y@{{.*}}(%rip), %rax
+; CHECK-NEXT: vmovaps %ymm0, (%rax)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
store <8 x float> zeroinitializer, <8 x float>* @x, align 32
store <4 x double> zeroinitializer, <4 x double>* @y, align 32
ret void
}
-; CHECK: vpcmpeqd
-; CHECK: vinsertf128 $1
define void @ones([0 x float]* nocapture %RET, [0 x float]* nocapture %aFOO) nounwind {
+; CHECK-LABEL: ones:
+; CHECK: ## BB#0: ## %allocas
+; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vmovaps %ymm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
allocas:
%ptr2vec615 = bitcast [0 x float]* %RET to <8 x float>*
store <8 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float
@@ -34,9 +48,14 @@ float>* %ptr2vec615, align 32
ret void
}
-; CHECK: vpcmpeqd
-; CHECK: vinsertf128 $1
define void @ones2([0 x i32]* nocapture %RET, [0 x i32]* nocapture %aFOO) nounwind {
+; CHECK-LABEL: ones2:
+; CHECK: ## BB#0: ## %allocas
+; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vmovaps %ymm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
allocas:
%ptr2vec615 = bitcast [0 x i32]* %RET to <8 x i32>*
store <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32>* %ptr2vec615, align 32
@@ -44,18 +63,22 @@ allocas:
}
;;; Just make sure this doesn't crash
-; CHECK: _ISelCrash
define <4 x i64> @ISelCrash(<4 x i64> %a) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: ISelCrash:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 4>
ret <4 x i64> %shuffle
}
;;; Don't crash on movd
-; CHECK: _VMOVZQI2PQI
-; CHECK: vmovd (%
define <8 x i32> @VMOVZQI2PQI([0 x float]* nocapture %aFOO) nounwind {
-allocas:
+; CHECK-LABEL: VMOVZQI2PQI:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; CHECK-NEXT: retq
%ptrcast.i33.i = bitcast [0 x float]* %aFOO to i32*
%val.i34.i = load i32, i32* %ptrcast.i33.i, align 4
%ptroffset.i22.i992 = getelementptr [0 x float], [0 x float]* %aFOO, i64 0, i64 1
@@ -67,35 +90,45 @@ allocas:
;;;; Don't crash on fneg
; rdar://10566486
-; CHECK: fneg
-; CHECK: vxorps
define <16 x float> @fneg(<16 x float> %a) nounwind {
+; CHECK-LABEL: fneg:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: retq
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
ret <16 x float> %1
}
;;; Don't crash on build vector
-; CHECK: @build_vec_16x16
-; CHECK: vmovd
define <16 x i16> @build_vec_16x16(i16 %a) nounwind readonly {
+; CHECK-LABEL: build_vec_16x16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: vmovd %eax, %xmm0
+; CHECK-NEXT: retq
%res = insertelement <16 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %a, i32 0
ret <16 x i16> %res
}
;;; Check that VMOVPQIto64rr generates the assembly string "vmovq". Previously
;;; an incorrect mnemonic of "movd" was printed for this instruction.
-; CHECK: VMOVPQIto64rr
-; CHECK: vmovq
define i64 @VMOVPQIto64rr(<2 x i64> %a) {
-entry:
+; CHECK-LABEL: VMOVPQIto64rr:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovq %xmm0, %rax
+; CHECK-NEXT: retq
%vecext.i = extractelement <2 x i64> %a, i32 0
ret i64 %vecext.i
}
; PR22685
-; CHECK: mov00
-; CHECK: vmovss
define <8 x float> @mov00_8f32(float* %ptr) {
+; CHECK-LABEL: mov00_8f32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retq
%val = load float, float* %ptr
%vec = insertelement <8 x float> zeroinitializer, float %val, i32 0
ret <8 x float> %vec
diff --git a/test/CodeGen/X86/avx-cast.ll b/test/CodeGen/X86/avx-cast.ll
index 34c5dfaa0162..103715c3628e 100644
--- a/test/CodeGen/X86/avx-cast.ll
+++ b/test/CodeGen/X86/avx-cast.ll
@@ -9,6 +9,7 @@
define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castA:
; AVX: ## BB#0:
+; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: retq
@@ -19,6 +20,7 @@ define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp {
define <4 x double> @castB(<2 x double> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castB:
; AVX: ## BB#0:
+; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX-NEXT: retq
@@ -31,12 +33,14 @@ define <4 x double> @castB(<2 x double> %m) nounwind uwtable readnone ssp {
define <4 x i64> @castC(<2 x i64> %m) nounwind uwtable readnone ssp {
; AVX1-LABEL: castC:
; AVX1: ## BB#0:
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: castC:
; AVX2: ## BB#0:
+; AVX2-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: retq
@@ -50,6 +54,7 @@ define <4 x i64> @castC(<2 x i64> %m) nounwind uwtable readnone ssp {
define <4 x float> @castD(<8 x float> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castD:
; AVX: ## BB#0:
+; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%shuffle.i = shufflevector <8 x float> %m, <8 x float> %m, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -59,6 +64,7 @@ define <4 x float> @castD(<8 x float> %m) nounwind uwtable readnone ssp {
define <2 x i64> @castE(<4 x i64> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castE:
; AVX: ## BB#0:
+; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%shuffle.i = shufflevector <4 x i64> %m, <4 x i64> %m, <2 x i32> <i32 0, i32 1>
@@ -68,6 +74,7 @@ define <2 x i64> @castE(<4 x i64> %m) nounwind uwtable readnone ssp {
define <2 x double> @castF(<4 x double> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castF:
; AVX: ## BB#0:
+; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%shuffle.i = shufflevector <4 x double> %m, <4 x double> %m, <2 x i32> <i32 0, i32 1>
diff --git a/test/CodeGen/X86/avx-intel-ocl.ll b/test/CodeGen/X86/avx-intel-ocl.ll
index 3923ca850d1a..be4920d1122d 100644
--- a/test/CodeGen/X86/avx-intel-ocl.ll
+++ b/test/CodeGen/X86/avx-intel-ocl.ll
@@ -15,9 +15,10 @@ declare i32 @func_int(i32, i32)
; WIN64: ret
; X32-LABEL: testf16_inp
-; X32: movl %eax, (%esp)
; X32: vaddps {{.*}}, {{%ymm[0-1]}}
; X32: vaddps {{.*}}, {{%ymm[0-1]}}
+; Push is not deemed profitable if we're realigning the stack.
+; X32: {{pushl|movl}} %eax
; X32: call
; X32: ret
@@ -114,8 +115,8 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
; test functions with integer parameters
; pass parameters on stack for 32-bit platform
; X32-LABEL: test_int
-; X32: movl {{.*}}, 4(%esp)
-; X32: movl {{.*}}, (%esp)
+; X32: pushl {{.*}}
+; X32: pushl {{.*}}
; X32: call
; X32: addl {{.*}}, %eax
diff --git a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..c7cf857e1d44
--- /dev/null
+++ b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -0,0 +1,3778 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx-builtins.c
+
+define <4 x double> @test_mm256_add_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_add_pd:
+; X32: # BB#0:
+; X32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_add_pd:
+; X64: # BB#0:
+; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = fadd <4 x double> %a0, %a1
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_add_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_add_ps:
+; X32: # BB#0:
+; X32-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_add_ps:
+; X64: # BB#0:
+; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = fadd <8 x float> %a0, %a1
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_addsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_addsub_pd:
+; X32: # BB#0:
+; X32-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_addsub_pd:
+; X64: # BB#0:
+; X64-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <8 x float> @test_mm256_addsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_addsub_ps:
+; X32: # BB#0:
+; X32-NEXT: vaddsubps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_addsub_ps:
+; X64: # BB#0:
+; X64-NEXT: vaddsubps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_and_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_and_pd:
+; X32: # BB#0:
+; X32-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_and_pd:
+; X64: # BB#0:
+; X64-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = bitcast <4 x double> %a0 to <4 x i64>
+ %2 = bitcast <4 x double> %a1 to <4 x i64>
+ %res = and <4 x i64> %1, %2
+ %bc = bitcast <4 x i64> %res to <4 x double>
+ ret <4 x double> %bc
+}
+
+define <8 x float> @test_mm256_and_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_and_ps:
+; X32: # BB#0:
+; X32-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_and_ps:
+; X64: # BB#0:
+; X64-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = bitcast <8 x float> %a0 to <8 x i32>
+ %2 = bitcast <8 x float> %a1 to <8 x i32>
+ %res = and <8 x i32> %1, %2
+ %bc = bitcast <8 x i32> %res to <8 x float>
+ ret <8 x float> %bc
+}
+
+define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_andnot_pd:
+; X32: # BB#0:
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; X32-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; X32-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_andnot_pd:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; X64-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; X64-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = bitcast <4 x double> %a0 to <4 x i64>
+ %2 = bitcast <4 x double> %a1 to <4 x i64>
+ %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1>
+ %res = and <4 x i64> %3, %2
+ %bc = bitcast <4 x i64> %res to <4 x double>
+ ret <4 x double> %bc
+}
+
+define <8 x float> @test_mm256_andnot_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_andnot_ps:
+; X32: # BB#0:
+; X32-NEXT: vandnps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_andnot_ps:
+; X64: # BB#0:
+; X64-NEXT: vandnps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = bitcast <8 x float> %a0 to <8 x i32>
+ %2 = bitcast <8 x float> %a1 to <8 x i32>
+ %3 = xor <8 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %res = and <8 x i32> %3, %2
+ %bc = bitcast <8 x i32> %res to <8 x float>
+ ret <8 x float> %bc
+}
+
+define <4 x double> @test_mm256_blend_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_blend_pd:
+; X32: # BB#0:
+; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_blend_pd:
+; X64: # BB#0:
+; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_blend_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_blend_ps:
+; X32: # BB#0:
+; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_blend_ps:
+; X64: # BB#0:
+; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_blendv_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) nounwind {
+; X32-LABEL: test_mm256_blendv_pd:
+; X32: # BB#0:
+; X32-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_blendv_pd:
+; X64: # BB#0:
+; X64-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
+
+define <8 x float> @test_mm256_blendv_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind {
+; X32-LABEL: test_mm256_blendv_ps:
+; X32: # BB#0:
+; X32-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_blendv_ps:
+; X64: # BB#0:
+; X64-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_broadcast_pd(<2 x double>* %a0) nounwind {
+; X32-LABEL: test_mm256_broadcast_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcast_pd:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x double>* %a0 to i8*
+ %res = call <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8* %arg0)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8*) nounwind readonly
+
+define <8 x float> @test_mm256_broadcast_ps(<4 x float>* %a0) nounwind {
+; X32-LABEL: test_mm256_broadcast_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcast_ps:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x float>* %a0 to i8*
+ %res = call <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8* %arg0)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8*) nounwind readonly
+
+define <4 x double> @test_mm256_broadcast_sd(double* %a0) nounwind {
+; X32-LABEL: test_mm256_broadcast_sd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastsd (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcast_sd:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastsd (%rdi), %ymm0
+; X64-NEXT: retq
+ %ld = load double, double* %a0
+ %ins0 = insertelement <4 x double> undef, double %ld, i32 0
+ %ins1 = insertelement <4 x double> %ins0, double %ld, i32 1
+ %ins2 = insertelement <4 x double> %ins1, double %ld, i32 2
+ %ins3 = insertelement <4 x double> %ins2, double %ld, i32 3
+ ret <4 x double> %ins3
+}
+
+define <4 x float> @test_mm_broadcast_ss(float* %a0) nounwind {
+; X32-LABEL: test_mm_broadcast_ss:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcast_ss:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastss (%rdi), %xmm0
+; X64-NEXT: retq
+ %ld = load float, float* %a0
+ %ins0 = insertelement <4 x float> undef, float %ld, i32 0
+ %ins1 = insertelement <4 x float> %ins0, float %ld, i32 1
+ %ins2 = insertelement <4 x float> %ins1, float %ld, i32 2
+ %ins3 = insertelement <4 x float> %ins2, float %ld, i32 3
+ ret <4 x float> %ins3
+}
+
+define <8 x float> @test_mm256_broadcast_ss(float* %a0) nounwind {
+; X32-LABEL: test_mm256_broadcast_ss:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastss (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcast_ss:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastss (%rdi), %ymm0
+; X64-NEXT: retq
+ %ld = load float, float* %a0
+ %ins0 = insertelement <8 x float> undef, float %ld, i32 0
+ %ins1 = insertelement <8 x float> %ins0, float %ld, i32 1
+ %ins2 = insertelement <8 x float> %ins1, float %ld, i32 2
+ %ins3 = insertelement <8 x float> %ins2, float %ld, i32 3
+ %ins4 = insertelement <8 x float> %ins3, float %ld, i32 4
+ %ins5 = insertelement <8 x float> %ins4, float %ld, i32 5
+ %ins6 = insertelement <8 x float> %ins5, float %ld, i32 6
+ %ins7 = insertelement <8 x float> %ins6, float %ld, i32 7
+ ret <8 x float> %ins7
+}
+
+define <8 x float> @test_mm256_castpd_ps(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_castpd_ps:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castpd_ps:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <4 x double> %a0 to <8 x float>
+ ret <8 x float> %res
+}
+
+define <4 x i64> @test_mm256_castpd_si256(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_castpd_si256:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castpd_si256:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <4 x double> %a0 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x double> @test_mm256_castpd128_pd256(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_castpd128_pd256:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castpd128_pd256:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ ret <4 x double> %res
+}
+
+define <2 x double> @test_mm256_castpd256_pd128(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_castpd256_pd128:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castpd256_pd128:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 0, i32 1>
+ ret <2 x double> %res
+}
+
+define <4 x double> @test_mm256_castps_pd(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_castps_pd:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castps_pd:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <8 x float> %a0 to <4 x double>
+ ret <4 x double> %res
+}
+
+define <4 x i64> @test_mm256_castps_si256(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_castps_si256:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castps_si256:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <8 x float> %a0 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <8 x float> @test_mm256_castps128_ps256(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_castps128_ps256:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castps128_ps256:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x float> %res
+}
+
+define <4 x float> @test_mm256_castps256_ps128(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_castps256_ps128:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castps256_ps128:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+define <4 x i64> @test_mm256_castsi128_si256(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_castsi128_si256:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castsi128_si256:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64-NEXT: retq
+ %res = shufflevector <2 x i64> %a0, <2 x i64> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ ret <4 x i64> %res
+}
+
+define <4 x double> @test_mm256_castsi256_pd(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_castsi256_pd:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castsi256_pd:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <4 x i64> %a0 to <4 x double>
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_castsi256_ps(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_castsi256_ps:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castsi256_ps:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <4 x i64> %a0 to <8 x float>
+ ret <8 x float> %res
+}
+
+define <2 x i64> @test_mm256_castsi256_si128(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_castsi256_si128:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_castsi256_si128:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 0, i32 1>
+ ret <2 x i64> %res
+}
+
+define <4 x double> @test_mm256_ceil_pd(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_ceil_pd:
+; X32: # BB#0:
+; X32-NEXT: vroundpd $2, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_ceil_pd:
+; X64: # BB#0:
+; X64-NEXT: vroundpd $2, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 2)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
+
+define <8 x float> @test_mm256_ceil_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_ceil_ps:
+; X32: # BB#0:
+; X32-NEXT: vroundps $2, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_ceil_ps:
+; X64: # BB#0:
+; X64-NEXT: vroundps $2, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 2)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
+
+define <2 x double> @test_mm_cmp_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmp_pd:
+; X32: # BB#0:
+; X32-NEXT: vcmpgepd %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmp_pd:
+; X64: # BB#0:
+; X64-NEXT: vcmpgepd %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 13)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x double> @test_mm256_cmp_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_cmp_pd:
+; X32: # BB#0:
+; X32-NEXT: vcmpgepd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmp_pd:
+; X64: # BB#0:
+; X64-NEXT: vcmpgepd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 13)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x float> @test_mm_cmp_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmp_ps:
+; X32: # BB#0:
+; X32-NEXT: vcmpgeps %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmp_ps:
+; X64: # BB#0:
+; X64-NEXT: vcmpgeps %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 13)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x float> @test_mm256_cmp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_cmp_ps:
+; X32: # BB#0:
+; X32-NEXT: vcmpgeps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmp_ps:
+; X64: # BB#0:
+; X64-NEXT: vcmpgeps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 13)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+define <2 x double> @test_mm_cmp_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmp_sd:
+; X32: # BB#0:
+; X32-NEXT: vcmpgesd %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmp_sd:
+; X64: # BB#0:
+; X64-NEXT: vcmpgesd %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 13)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x float> @test_mm_cmp_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmp_ss:
+; X32: # BB#0:
+; X32-NEXT: vcmpgess %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmp_ss:
+; X64: # BB#0:
+; X64-NEXT: vcmpgess %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 13)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x double> @test_mm256_cvtepi32_pd(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_cvtepi32_pd:
+; X32: # BB#0:
+; X32-NEXT: vcvtdq2pd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi32_pd:
+; X64: # BB#0:
+; X64-NEXT: vcvtdq2pd %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = sitofp <4 x i32> %arg0 to <4 x double>
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_cvtepi32_ps(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_cvtepi32_ps:
+; X32: # BB#0:
+; X32-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi32_ps:
+; X64: # BB#0:
+; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %res = call <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32> %arg0)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm256_cvtpd_epi32(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_cvtpd_epi32:
+; X32: # BB#0:
+; X32-NEXT: vcvtpd2dqy %ymm0, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtpd_epi32:
+; X64: # BB#0:
+; X64-NEXT: vcvtpd2dqy %ymm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %cvt = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0)
+ %res = bitcast <4 x i32> %cvt to <2 x i64>
+ ret <2 x i64> %res
+}
+declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone
+
+define <4 x float> @test_mm256_cvtpd_ps(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_cvtpd_ps:
+; X32: # BB#0:
+; X32-NEXT: vcvtpd2psy %ymm0, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtpd_ps:
+; X64: # BB#0:
+; X64-NEXT: vcvtpd2psy %ymm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>) nounwind readnone
+
+define <4 x i64> @test_mm256_cvtps_epi32(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_cvtps_epi32:
+; X32: # BB#0:
+; X32-NEXT: vcvtps2dq %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtps_epi32:
+; X64: # BB#0:
+; X64-NEXT: vcvtps2dq %ymm0, %ymm0
+; X64-NEXT: retq
+ %cvt = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
+ %res = bitcast <8 x i32> %cvt to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_cvtps_pd(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_cvtps_pd:
+; X32: # BB#0:
+; X32-NEXT: vcvtps2pd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtps_pd:
+; X64: # BB#0:
+; X64-NEXT: vcvtps2pd %xmm0, %ymm0
+; X64-NEXT: retq
+ %res = fpext <4 x float> %a0 to <4 x double>
+ ret <4 x double> %res
+}
+
+define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_cvttpd_epi32:
+; X32: # BB#0:
+; X32-NEXT: vcvttpd2dqy %ymm0, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvttpd_epi32:
+; X64: # BB#0:
+; X64-NEXT: vcvttpd2dqy %ymm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %cvt = fptosi <4 x double> %a0 to <4 x i32>
+ %res = bitcast <4 x i32> %cvt to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_cvttps_epi32:
+; X32: # BB#0:
+; X32-NEXT: vcvttps2dq %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvttps_epi32:
+; X64: # BB#0:
+; X64-NEXT: vcvttps2dq %ymm0, %ymm0
+; X64-NEXT: retq
+ %cvt = fptosi <8 x float> %a0 to <8 x i32>
+ %res = bitcast <8 x i32> %cvt to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_div_pd:
+; X32: # BB#0:
+; X32-NEXT: vdivpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_div_pd:
+; X64: # BB#0:
+; X64-NEXT: vdivpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = fdiv <4 x double> %a0, %a1
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_div_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_div_ps:
+; X32: # BB#0:
+; X32-NEXT: vdivps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_div_ps:
+; X64: # BB#0:
+; X64-NEXT: vdivps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = fdiv <8 x float> %a0, %a1
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm256_dp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_dp_ps:
+; X32: # BB#0:
+; X32-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_dp_ps:
+; X64: # BB#0:
+; X64-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+define i32 @test_mm256_extract_epi8(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_extract_epi8:
+; X32: # BB#0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vpextrb $15, %xmm0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_extract_epi8:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vpextrb $15, %xmm0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %ext = extractelement <32 x i8> %arg0, i32 31
+ %res = zext i8 %ext to i32
+ ret i32 %res
+}
+
+define i32 @test_mm256_extract_epi16(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_extract_epi16:
+; X32: # BB#0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vpextrw $3, %xmm0, %eax
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_extract_epi16:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vpextrw $3, %xmm0, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %ext = extractelement <16 x i16> %arg0, i32 11
+ %res = zext i16 %ext to i32
+ ret i32 %res
+}
+
+define i32 @test_mm256_extract_epi32(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_extract_epi32:
+; X32: # BB#0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vpextrd $1, %xmm0, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_extract_epi32:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vpextrd $1, %xmm0, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %res = extractelement <8 x i32> %arg0, i32 5
+ ret i32 %res
+}
+
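+; VPEXTRQ is only encodable in 64-bit mode, so the X32 run returns the i64 in
+; EDX:EAX using two 32-bit VPEXTRD extracts from the upper lane.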
+define i64 @test_mm256_extract_epi64(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_extract_epi64:
+; X32: # BB#0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vpextrd $2, %xmm0, %eax
+; X32-NEXT: vpextrd $3, %xmm0, %edx
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_extract_epi64:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vpextrq $1, %xmm0, %rax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = extractelement <4 x i64> %a0, i32 3
+ ret i64 %res
+}
+
+define <2 x double> @test_mm256_extractf128_pd(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_extractf128_pd:
+; X32: # BB#0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_extractf128_pd:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 2, i32 3>
+ ret <2 x double> %res
+}
+
+define <4 x float> @test_mm256_extractf128_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_extractf128_ps:
+; X32: # BB#0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_extractf128_ps:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ ret <4 x float> %res
+}
+
+define <2 x i64> @test_mm256_extractf128_si256(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_extractf128_si256:
+; X32: # BB#0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_extractf128_si256:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
+ ret <2 x i64> %res
+}
+
+define <4 x double> @test_mm256_floor_pd(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_floor_pd:
+; X32: # BB#0:
+; X32-NEXT: vroundpd $1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_floor_pd:
+; X64: # BB#0:
+; X64-NEXT: vroundpd $1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 1)
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_floor_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_floor_ps:
+; X32: # BB#0:
+; X32-NEXT: vroundps $1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_floor_ps:
+; X64: # BB#0:
+; X64-NEXT: vroundps $1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 1)
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_hadd_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_hadd_pd:
+; X32: # BB#0:
+; X32-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hadd_pd:
+; X64: # BB#0:
+; X64-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <8 x float> @test_mm256_hadd_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_hadd_ps:
+; X32: # BB#0:
+; X32-NEXT: vhaddps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hadd_ps:
+; X64: # BB#0:
+; X64-NEXT: vhaddps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_hsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_hsub_pd:
+; X32: # BB#0:
+; X32-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hsub_pd:
+; X64: # BB#0:
+; X64-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <8 x float> @test_mm256_hsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_hsub_ps:
+; X32: # BB#0:
+; X32-NEXT: vhsubps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hsub_ps:
+; X64: # BB#0:
+; X64-NEXT: vhsubps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define <4 x i64> @test_mm256_insert_epi8(<4 x i64> %a0, i8 %a1) nounwind {
+; X32-LABEL: test_mm256_insert_epi8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $4, %eax, %xmm0, %xmm1
+; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_insert_epi8:
+; X64: # BB#0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm1
+; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %res = insertelement <32 x i8> %arg0, i8 %a1, i32 4
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
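+; vpinsrw takes a 32-bit register operand but only reads its low 16 bits, so
+; loading just %ax and then passing %eax to the insert is fine here.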
+define <4 x i64> @test_mm256_insert_epi16(<4 x i64> %a0, i16 %a1) nounwind {
+; X32-LABEL: test_mm256_insert_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_insert_epi16:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-NEXT: vpinsrw $6, %edi, %xmm1, %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %res = insertelement <16 x i16> %arg0, i16 %a1, i32 14
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_insert_epi32(<4 x i64> %a0, i32 %a1) nounwind {
+; X32-LABEL: test_mm256_insert_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm1
+; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_insert_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpinsrd $3, %edi, %xmm0, %xmm1
+; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %res = insertelement <8 x i32> %arg0, i32 %a1, i32 3
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
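+; Likewise VPINSRQ needs 64-bit mode, so the X32 run splits the i64 insert
+; into two 32-bit VPINSRD operations on the upper lane.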
+define <4 x i64> @test_mm256_insert_epi64(<4 x i64> %a0, i64 %a1) nounwind {
+; X32-LABEL: test_mm256_insert_epi64:
+; X32: # BB#0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm2
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_insert_epi64:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = insertelement <4 x i64> %a0, i64 %a1, i32 3
+ ret <4 x i64> %res
+}
+
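+; Inserting the 128-bit value into lane 0 (here and in the si256 variant
+; below) is matched to a blend of the two ymm registers rather than a
+; vinsertf128 $0.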
+define <4 x double> @test_mm256_insertf128_pd(<4 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_insertf128_pd:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_insertf128_pd:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; X64-NEXT: retq
+ %ext = shufflevector <2 x double> %a1, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %res = shufflevector <4 x double> %a0, <4 x double> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_insertf128_ps(<8 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_insertf128_ps:
+; X32: # BB#0:
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_insertf128_ps:
+; X64: # BB#0:
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %ext = shufflevector <4 x float> %a1, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ %res = shufflevector <8 x float> %a0, <8 x float> %ext, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x float> %res
+}
+
+define <4 x i64> @test_mm256_insertf128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_insertf128_si256:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_insertf128_si256:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; X64-NEXT: retq
+ %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_lddqu_si256(<4 x i64>* %a0) nounwind {
+; X32-LABEL: test_mm256_lddqu_si256:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vlddqu (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_lddqu_si256:
+; X64: # BB#0:
+; X64-NEXT: vlddqu (%rdi), %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64>* %a0 to i8*
+ %res = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %arg0)
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readnone
+
+define <4 x double> @test_mm256_load_pd(double* %a0) nounwind {
+; X32-LABEL: test_mm256_load_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_load_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovaps (%rdi), %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <4 x double>*
+ %res = load <4 x double>, <4 x double>* %arg0, align 32
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_load_ps(float* %a0) nounwind {
+; X32-LABEL: test_mm256_load_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_load_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovaps (%rdi), %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <8 x float>*
+ %res = load <8 x float>, <8 x float>* %arg0, align 32
+ ret <8 x float> %res
+}
+
+define <4 x i64> @test_mm256_load_si256(<4 x i64>* %a0) nounwind {
+; X32-LABEL: test_mm256_load_si256:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_load_si256:
+; X64: # BB#0:
+; X64-NEXT: vmovaps (%rdi), %ymm0
+; X64-NEXT: retq
+ %res = load <4 x i64>, <4 x i64>* %a0, align 32
+ ret <4 x i64> %res
+}
+
+define <4 x double> @test_mm256_loadu_pd(double* %a0) nounwind {
+; X32-LABEL: test_mm256_loadu_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovups (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_loadu_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovups (%rdi), %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <4 x double>*
+ %res = load <4 x double>, <4 x double>* %arg0, align 1
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_loadu_ps(float* %a0) nounwind {
+; X32-LABEL: test_mm256_loadu_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovups (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_loadu_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovups (%rdi), %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <8 x float>*
+ %res = load <8 x float>, <8 x float>* %arg0, align 1
+ ret <8 x float> %res
+}
+
+define <4 x i64> @test_mm256_loadu_si256(<4 x i64>* %a0) nounwind {
+; X32-LABEL: test_mm256_loadu_si256:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovups (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_loadu_si256:
+; X64: # BB#0:
+; X64-NEXT: vmovups (%rdi), %ymm0
+; X64-NEXT: retq
+ %res = load <4 x i64>, <4 x i64>* %a0, align 1
+ ret <4 x i64> %res
+}
+
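+; In the loadu2 tests the first pointer supplies the upper 128 bits and the
+; second the lower 128 bits (matching _mm256_loadu2_m128(hiaddr, loaddr)),
+; hence the plain vmovups from %a1 followed by a vinsertf128 from %a0.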
+define <8 x float> @test_mm256_loadu2_m128(float* %a0, float* %a1) nounwind {
+; X32-LABEL: test_mm256_loadu2_m128:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovups (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, (%ecx), %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_loadu2_m128:
+; X64: # BB#0:
+; X64-NEXT: vmovups (%rsi), %xmm0
+; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ %hi4 = load <4 x float>, <4 x float>* %arg0, align 1
+ %hi8 = shufflevector <4 x float> %hi4, <4 x float> %hi4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ %arg1 = bitcast float* %a1 to <4 x float>*
+ %lo4 = load <4 x float>, <4 x float>* %arg1, align 1
+ %lo8 = shufflevector <4 x float> %lo4, <4 x float> %lo4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ %res = shufflevector <8 x float> %lo8, <8 x float> %hi8, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_loadu2_m128d(double* %a0, double* %a1) nounwind {
+; X32-LABEL: test_mm256_loadu2_m128d:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovups (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, (%ecx), %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_loadu2_m128d:
+; X64: # BB#0:
+; X64-NEXT: vmovups (%rsi), %xmm0
+; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <2 x double>*
+ %hi2 = load <2 x double>, <2 x double>* %arg0, align 1
+ %hi4 = shufflevector <2 x double> %hi2, <2 x double> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %arg1 = bitcast double* %a1 to <2 x double>*
+ %lo2 = load <2 x double>, <2 x double>* %arg1, align 1
+ %lo4 = shufflevector <2 x double> %lo2, <2 x double> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %res = shufflevector <4 x double> %lo4, <4 x double> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x double> %res
+}
+
+define <4 x i64> @test_mm256_loadu2_m128i(i64* %a0, i64* %a1) nounwind {
+; X32-LABEL: test_mm256_loadu2_m128i:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovups (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, (%ecx), %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_loadu2_m128i:
+; X64: # BB#0:
+; X64-NEXT: vmovups (%rsi), %xmm0
+; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast i64* %a0 to <2 x i64>*
+ %hi2 = load <2 x i64>, <2 x i64>* %arg0, align 1
+ %hi4 = shufflevector <2 x i64> %hi2, <2 x i64> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %arg1 = bitcast i64* %a1 to <2 x i64>*
+ %lo2 = load <2 x i64>, <2 x i64>* %arg1, align 1
+ %lo4 = shufflevector <2 x i64> %lo2, <2 x i64> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %res = shufflevector <4 x i64> %lo4, <4 x i64> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x i64> %res
+}
+
+define <2 x double> @test_mm_maskload_pd(double* %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_maskload_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmaskmovpd (%eax), %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskload_pd:
+; X64: # BB#0:
+; X64-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to i8*
+ %res = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %arg0, <2 x i64> %a1)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>) nounwind readnone
+
+define <4 x double> @test_mm256_maskload_pd(double* %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_maskload_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmaskmovpd (%eax), %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskload_pd:
+; X64: # BB#0:
+; X64-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to i8*
+ %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %arg0, <4 x i64> %a1)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>) nounwind readnone
+
+define <4 x float> @test_mm_maskload_ps(float* %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_maskload_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmaskmovps (%eax), %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskload_ps:
+; X64: # BB#0:
+; X64-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %arg0, <4 x i32> %arg1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>) nounwind readnone
+
+define <8 x float> @test_mm256_maskload_ps(float* %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_maskload_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmaskmovps (%eax), %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskload_ps:
+; X64: # BB#0:
+; X64-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to i8*
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %arg0, <8 x i32> %arg1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>) nounwind readnone
+
+define void @test_mm_maskstore_pd(double* %a0, <2 x i64> %a1, <2 x double> %a2) nounwind {
+; X32-LABEL: test_mm_maskstore_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmaskmovpd %xmm1, %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskstore_pd:
+; X64: # BB#0:
+; X64-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to i8*
+ call void @llvm.x86.avx.maskstore.pd(i8* %arg0, <2 x i64> %a1, <2 x double> %a2)
+ ret void
+}
+declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind readnone
+
+define void @test_mm256_maskstore_pd(double* %a0, <4 x i64> %a1, <4 x double> %a2) nounwind {
+; X32-LABEL: test_mm256_maskstore_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskstore_pd:
+; X64: # BB#0:
+; X64-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to i8*
+ call void @llvm.x86.avx.maskstore.pd.256(i8* %arg0, <4 x i64> %a1, <4 x double> %a2)
+ ret void
+}
+declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwind readnone
+
+define void @test_mm_maskstore_ps(float* %a0, <2 x i64> %a1, <4 x float> %a2) nounwind {
+; X32-LABEL: test_mm_maskstore_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmaskmovps %xmm1, %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskstore_ps:
+; X64: # BB#0:
+; X64-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ call void @llvm.x86.avx.maskstore.ps(i8* %arg0, <4 x i32> %arg1, <4 x float> %a2)
+ ret void
+}
+declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind readnone
+
+define void @test_mm256_maskstore_ps(float* %a0, <4 x i64> %a1, <8 x float> %a2) nounwind {
+; X32-LABEL: test_mm256_maskstore_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmaskmovps %ymm1, %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskstore_ps:
+; X64: # BB#0:
+; X64-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to i8*
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ call void @llvm.x86.avx.maskstore.ps.256(i8* %arg0, <8 x i32> %arg1, <8 x float> %a2)
+ ret void
+}
+declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_max_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_max_pd:
+; X32: # BB#0:
+; X32-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_pd:
+; X64: # BB#0:
+; X64-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <8 x float> @test_mm256_max_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_max_ps:
+; X32: # BB#0:
+; X32-NEXT: vmaxps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_ps:
+; X64: # BB#0:
+; X64-NEXT: vmaxps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_min_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_min_pd:
+; X32: # BB#0:
+; X32-NEXT: vminpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_pd:
+; X64: # BB#0:
+; X64-NEXT: vminpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <8 x float> @test_mm256_min_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_min_ps:
+; X32: # BB#0:
+; X32-NEXT: vminps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_ps:
+; X64: # BB#0:
+; X64-NEXT: vminps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_movedup_pd(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_movedup_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_movedup_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ ret <8 x float> %res
+}
+
+define i32 @test_mm256_movemask_pd(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_movemask_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovmskpd %ymm0, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_movemask_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovmskpd %ymm0, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
+
+define i32 @test_mm256_movemask_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_movemask_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovmskps %ymm0, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_movemask_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovmskps %ymm0, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_mul_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_mul_pd:
+; X32: # BB#0:
+; X32-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mul_pd:
+; X64: # BB#0:
+; X64-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = fmul <4 x double> %a0, %a1
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_mul_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_mul_ps:
+; X32: # BB#0:
+; X32-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mul_ps:
+; X64: # BB#0:
+; X64-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = fmul <8 x float> %a0, %a1
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_or_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_or_pd:
+; X32: # BB#0:
+; X32-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_or_pd:
+; X64: # BB#0:
+; X64-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = bitcast <4 x double> %a0 to <4 x i64>
+ %2 = bitcast <4 x double> %a1 to <4 x i64>
+ %res = or <4 x i64> %1, %2
+ %bc = bitcast <4 x i64> %res to <4 x double>
+ ret <4 x double> %bc
+}
+
+define <8 x float> @test_mm256_or_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_or_ps:
+; X32: # BB#0:
+; X32-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_or_ps:
+; X64: # BB#0:
+; X64-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = bitcast <8 x float> %a0 to <8 x i32>
+ %2 = bitcast <8 x float> %a1 to <8 x i32>
+ %res = or <8 x i32> %1, %2
+ %bc = bitcast <8 x i32> %res to <8 x float>
+ ret <8 x float> %bc
+}
+
+define <2 x double> @test_mm_permute_pd(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_permute_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_permute_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> %a0, <2 x i32> <i32 1, i32 0>
+ ret <2 x double> %res
+}
+
+define <4 x double> @test_mm256_permute_pd(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_permute_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x double> %res
+}
+
+define <4 x float> @test_mm_permute_ps(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_permute_ps:
+; X32: # BB#0:
+; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_permute_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test2_mm_permute_ps(<4 x float> %a0) nounwind {
+; X32-LABEL: test2_mm_permute_ps:
+; X32: # BB#0:
+; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test2_mm_permute_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+define <8 x float> @test_mm256_permute_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_permute_ps:
+; X32: # BB#0:
+; X32-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x float> %res
+}
+
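+; vperm2f128 immediate 44 (0b00101100): bit 3 zeroes the low lane and
+; bits [5:4] = 2 select %a1's low lane for the high half, giving
+; "zero,zero,ymm1[0,1]".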
+define <4 x double> @test_mm256_permute2f128_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_permute2f128_pd:
+; X32: # BB#0:
+; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute2f128_pd:
+; X64: # BB#0:
+; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1]
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 44)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
+
+; PR26667
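+; Immediate 50 (0b00110010) selects %a1's low lane (bits [1:0] = 2) and %a1's
+; high lane (bits [5:4] = 3), so the whole permute folds to a register move.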
+define <8 x float> @test_mm256_permute2f128_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_permute2f128_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovaps %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute2f128_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovaps %ymm1, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 50)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+define <4 x i64> @test_mm256_permute2f128_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_permute2f128_si256:
+; X32: # BB#0:
+; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute2f128_si256:
+; X64: # BB#0:
+; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
+; X64-NEXT: retq
+ %1 = bitcast <4 x i64> %a0 to <8 x i32>
+ %2 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %1, <8 x i32> %2, i8 35)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
+
+define <2 x double> @test_mm_permutevar_pd(<2 x double> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_permutevar_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_permutevar_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone
+
+define <4 x double> @test_mm256_permutevar_pd(<4 x double> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_permutevar_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permutevar_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone
+
+define <4 x float> @test_mm_permutevar_ps(<4 x float> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_permutevar_ps:
+; X32: # BB#0:
+; X32-NEXT: vpermilps %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_permutevar_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermilps %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %arg1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone
+
+define <8 x float> @test_mm256_permutevar_ps(<8 x float> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_permutevar_ps:
+; X32: # BB#0:
+; X32-NEXT: vpermilps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permutevar_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermilps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %arg1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone
+
+define <8 x float> @test_mm256_rcp_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_rcp_ps:
+; X32: # BB#0:
+; X32-NEXT: vrcpps %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_rcp_ps:
+; X64: # BB#0:
+; X64-NEXT: vrcpps %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_round_pd(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_round_pd:
+; X32: # BB#0:
+; X32-NEXT: vroundpd $4, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_round_pd:
+; X64: # BB#0:
+; X64-NEXT: vroundpd $4, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 4)
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_round_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_round_ps:
+; X32: # BB#0:
+; X32-NEXT: vroundps $4, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_round_ps:
+; X64: # BB#0:
+; X64-NEXT: vroundps $4, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 4)
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm256_rsqrt_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_rsqrt_ps:
+; X32: # BB#0:
+; X32-NEXT: vrsqrtps %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_rsqrt_ps:
+; X64: # BB#0:
+; X64-NEXT: vrsqrtps %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
+
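+; The _mm256_set_* tests pass arguments from the highest element down, so the
+; IR inserts %a31..%a0 into elements 0..31 and the checks build each 128-bit
+; half with pinsr instructions before combining them with vinsertf128.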
+define <4 x i64> @test_mm256_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
+; X32-LABEL: test_mm256_set_epi8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovd %ecx, %xmm0
+; X32-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovd %ecx, %xmm1
+; X32-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set_epi8:
+; X64: # BB#0:
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vmovd %eax, %xmm0
+; X64-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl %r9b, %eax
+; X64-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl %dl, %eax
+; X64-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; X64-NEXT: vmovd %ecx, %xmm1
+; X64-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <32 x i8> undef, i8 %a31, i32 0
+ %res1 = insertelement <32 x i8> %res0, i8 %a30, i32 1
+ %res2 = insertelement <32 x i8> %res1, i8 %a29, i32 2
+ %res3 = insertelement <32 x i8> %res2, i8 %a28, i32 3
+ %res4 = insertelement <32 x i8> %res3, i8 %a27, i32 4
+ %res5 = insertelement <32 x i8> %res4, i8 %a26, i32 5
+ %res6 = insertelement <32 x i8> %res5, i8 %a25, i32 6
+ %res7 = insertelement <32 x i8> %res6, i8 %a24, i32 7
+ %res8 = insertelement <32 x i8> %res7, i8 %a23, i32 8
+ %res9 = insertelement <32 x i8> %res8, i8 %a22, i32 9
+ %res10 = insertelement <32 x i8> %res9, i8 %a21, i32 10
+ %res11 = insertelement <32 x i8> %res10, i8 %a20, i32 11
+ %res12 = insertelement <32 x i8> %res11, i8 %a19, i32 12
+ %res13 = insertelement <32 x i8> %res12, i8 %a18, i32 13
+ %res14 = insertelement <32 x i8> %res13, i8 %a17, i32 14
+ %res15 = insertelement <32 x i8> %res14, i8 %a16, i32 15
+ %res16 = insertelement <32 x i8> %res15, i8 %a15, i32 16
+ %res17 = insertelement <32 x i8> %res16, i8 %a14, i32 17
+ %res18 = insertelement <32 x i8> %res17, i8 %a13, i32 18
+ %res19 = insertelement <32 x i8> %res18, i8 %a12, i32 19
+ %res20 = insertelement <32 x i8> %res19, i8 %a11, i32 20
+ %res21 = insertelement <32 x i8> %res20, i8 %a10, i32 21
+ %res22 = insertelement <32 x i8> %res21, i8 %a9 , i32 22
+ %res23 = insertelement <32 x i8> %res22, i8 %a8 , i32 23
+ %res24 = insertelement <32 x i8> %res23, i8 %a7 , i32 24
+ %res25 = insertelement <32 x i8> %res24, i8 %a6 , i32 25
+ %res26 = insertelement <32 x i8> %res25, i8 %a5 , i32 26
+ %res27 = insertelement <32 x i8> %res26, i8 %a4 , i32 27
+ %res28 = insertelement <32 x i8> %res27, i8 %a3 , i32 28
+ %res29 = insertelement <32 x i8> %res28, i8 %a2 , i32 29
+ %res30 = insertelement <32 x i8> %res29, i8 %a1 , i32 30
+ %res31 = insertelement <32 x i8> %res30, i8 %a0 , i32 31
+ %res = bitcast <32 x i8> %res31 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
+; X32-LABEL: test_mm256_set_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vmovd %eax, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vmovd %eax, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set_epi16:
+; X64: # BB#0:
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vmovd %eax, %xmm0
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; X64-NEXT: vpinsrw $2, %r9d, %xmm0, %xmm0
+; X64-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0
+; X64-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; X64-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0
+; X64-NEXT: vpinsrw $6, %esi, %xmm0, %xmm0
+; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vmovd %eax, %xmm1
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <16 x i16> undef, i16 %a15, i32 0
+ %res1 = insertelement <16 x i16> %res0, i16 %a14, i32 1
+ %res2 = insertelement <16 x i16> %res1, i16 %a13, i32 2
+ %res3 = insertelement <16 x i16> %res2, i16 %a12, i32 3
+ %res4 = insertelement <16 x i16> %res3, i16 %a11, i32 4
+ %res5 = insertelement <16 x i16> %res4, i16 %a10, i32 5
+ %res6 = insertelement <16 x i16> %res5, i16 %a9 , i32 6
+ %res7 = insertelement <16 x i16> %res6, i16 %a8 , i32 7
+ %res8 = insertelement <16 x i16> %res7, i16 %a7 , i32 8
+ %res9 = insertelement <16 x i16> %res8, i16 %a6 , i32 9
+ %res10 = insertelement <16 x i16> %res9, i16 %a5 , i32 10
+ %res11 = insertelement <16 x i16> %res10, i16 %a4 , i32 11
+ %res12 = insertelement <16 x i16> %res11, i16 %a3 , i32 12
+ %res13 = insertelement <16 x i16> %res12, i16 %a2 , i32 13
+ %res14 = insertelement <16 x i16> %res13, i16 %a1 , i32 14
+ %res15 = insertelement <16 x i16> %res14, i16 %a0 , i32 15
+ %res = bitcast <16 x i16> %res15 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
+; X32-LABEL: test_mm256_set_epi32:
+; X32: # BB#0:
+; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set_epi32:
+; X64: # BB#0:
+; X64-NEXT: vmovd %ecx, %xmm0
+; X64-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
+; X64-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0
+; X64-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
+; X64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; X64-NEXT: vpinsrd $2, %r9d, %xmm1, %xmm1
+; X64-NEXT: vpinsrd $3, %r8d, %xmm1, %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <8 x i32> undef, i32 %a7, i32 0
+ %res1 = insertelement <8 x i32> %res0, i32 %a6, i32 1
+ %res2 = insertelement <8 x i32> %res1, i32 %a5, i32 2
+ %res3 = insertelement <8 x i32> %res2, i32 %a4, i32 3
+ %res4 = insertelement <8 x i32> %res3, i32 %a3, i32 4
+ %res5 = insertelement <8 x i32> %res4, i32 %a2, i32 5
+ %res6 = insertelement <8 x i32> %res5, i32 %a1, i32 6
+ %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
+ %res = bitcast <8 x i32> %res7 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_set_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
+; X32-LABEL: test_mm256_set_epi64x:
+; X32: # BB#0:
+; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set_epi64x:
+; X64: # BB#0:
+; X64-NEXT: vmovq %rdi, %xmm0
+; X64-NEXT: vmovq %rsi, %xmm1
+; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-NEXT: vmovq %rdx, %xmm1
+; X64-NEXT: vmovq %rcx, %xmm2
+; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <4 x i64> undef, i64 %a3, i32 0
+ %res1 = insertelement <4 x i64> %res0, i64 %a2, i32 1
+ %res2 = insertelement <4 x i64> %res1, i64 %a1, i32 2
+ %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
+ ret <4 x i64> %res3
+}
+
+define <8 x float> @test_mm256_set_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_set_m128:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set_m128:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a1, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_set_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_set_m128d:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set_m128d:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x double> %a0 to <4 x float>
+ %arg1 = bitcast <2 x double> %a1 to <4 x float>
+ %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %bc = bitcast <8 x float> %res to <4 x double>
+ ret <4 x double> %bc
+}
+
+define <4 x i64> @test_mm256_set_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_set_m128i:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set_m128i:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x float>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x float>
+ %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %bc = bitcast <8 x float> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
+; X32-LABEL: test_mm256_set_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; X32-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set_pd:
+; X64: # BB#0:
+; X64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <4 x double> undef, double %a3, i32 0
+ %res1 = insertelement <4 x double> %res0, double %a2, i32 1
+ %res2 = insertelement <4 x double> %res1, double %a1, i32 2
+ %res3 = insertelement <4 x double> %res2, double %a0, i32 3
+ ret <4 x double> %res3
+}
+
+define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
+; X32-LABEL: test_mm256_set_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
+; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
+; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
+; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
+; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
+; X32-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set_ps:
+; X64: # BB#0:
+; X64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3]
+; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
+; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <8 x float> undef, float %a7, i32 0
+ %res1 = insertelement <8 x float> %res0, float %a6, i32 1
+ %res2 = insertelement <8 x float> %res1, float %a5, i32 2
+ %res3 = insertelement <8 x float> %res2, float %a4, i32 3
+ %res4 = insertelement <8 x float> %res3, float %a3, i32 4
+ %res5 = insertelement <8 x float> %res4, float %a2, i32 5
+ %res6 = insertelement <8 x float> %res5, float %a1, i32 6
+ %res7 = insertelement <8 x float> %res6, float %a0, i32 7
+ ret <8 x float> %res7
+}
+
+define <4 x i64> @test_mm256_set1_epi8(i8 %a0) nounwind {
+; X32-LABEL: test_mm256_set1_epi8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovd %eax, %xmm0
+; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X32-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set1_epi8:
+; X64: # BB#0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: vmovd %eax, %xmm0
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <32 x i8> undef, i8 %a0, i32 0
+ %res1 = insertelement <32 x i8> %res0, i8 %a0, i32 1
+ %res2 = insertelement <32 x i8> %res1, i8 %a0, i32 2
+ %res3 = insertelement <32 x i8> %res2, i8 %a0, i32 3
+ %res4 = insertelement <32 x i8> %res3, i8 %a0, i32 4
+ %res5 = insertelement <32 x i8> %res4, i8 %a0, i32 5
+ %res6 = insertelement <32 x i8> %res5, i8 %a0, i32 6
+ %res7 = insertelement <32 x i8> %res6, i8 %a0, i32 7
+ %res8 = insertelement <32 x i8> %res7, i8 %a0, i32 8
+ %res9 = insertelement <32 x i8> %res8, i8 %a0, i32 9
+ %res10 = insertelement <32 x i8> %res9, i8 %a0, i32 10
+ %res11 = insertelement <32 x i8> %res10, i8 %a0, i32 11
+ %res12 = insertelement <32 x i8> %res11, i8 %a0, i32 12
+ %res13 = insertelement <32 x i8> %res12, i8 %a0, i32 13
+ %res14 = insertelement <32 x i8> %res13, i8 %a0, i32 14
+ %res15 = insertelement <32 x i8> %res14, i8 %a0, i32 15
+ %res16 = insertelement <32 x i8> %res15, i8 %a0, i32 16
+ %res17 = insertelement <32 x i8> %res16, i8 %a0, i32 17
+ %res18 = insertelement <32 x i8> %res17, i8 %a0, i32 18
+ %res19 = insertelement <32 x i8> %res18, i8 %a0, i32 19
+ %res20 = insertelement <32 x i8> %res19, i8 %a0, i32 20
+ %res21 = insertelement <32 x i8> %res20, i8 %a0, i32 21
+ %res22 = insertelement <32 x i8> %res21, i8 %a0, i32 22
+ %res23 = insertelement <32 x i8> %res22, i8 %a0, i32 23
+ %res24 = insertelement <32 x i8> %res23, i8 %a0, i32 24
+ %res25 = insertelement <32 x i8> %res24, i8 %a0, i32 25
+ %res26 = insertelement <32 x i8> %res25, i8 %a0, i32 26
+ %res27 = insertelement <32 x i8> %res26, i8 %a0, i32 27
+ %res28 = insertelement <32 x i8> %res27, i8 %a0, i32 28
+ %res29 = insertelement <32 x i8> %res28, i8 %a0, i32 29
+ %res30 = insertelement <32 x i8> %res29, i8 %a0, i32 30
+ %res31 = insertelement <32 x i8> %res30, i8 %a0, i32 31
+ %res = bitcast <32 x i8> %res31 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
+; X32-LABEL: test_mm256_set1_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vmovd %eax, %xmm0
+; X32-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set1_epi16:
+; X64: # BB#0:
+; X64-NEXT: vmovd %edi, %xmm0
+; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <16 x i16> undef, i16 %a0, i32 0
+ %res1 = insertelement <16 x i16> %res0, i16 %a0, i32 1
+ %res2 = insertelement <16 x i16> %res1, i16 %a0, i32 2
+ %res3 = insertelement <16 x i16> %res2, i16 %a0, i32 3
+ %res4 = insertelement <16 x i16> %res3, i16 %a0, i32 4
+ %res5 = insertelement <16 x i16> %res4, i16 %a0, i32 5
+ %res6 = insertelement <16 x i16> %res5, i16 %a0, i32 6
+ %res7 = insertelement <16 x i16> %res6, i16 %a0, i32 7
+ %res8 = insertelement <16 x i16> %res7, i16 %a0, i32 8
+ %res9 = insertelement <16 x i16> %res8, i16 %a0, i32 9
+ %res10 = insertelement <16 x i16> %res9, i16 %a0, i32 10
+ %res11 = insertelement <16 x i16> %res10, i16 %a0, i32 11
+ %res12 = insertelement <16 x i16> %res11, i16 %a0, i32 12
+ %res13 = insertelement <16 x i16> %res12, i16 %a0, i32 13
+ %res14 = insertelement <16 x i16> %res13, i16 %a0, i32 14
+ %res15 = insertelement <16 x i16> %res14, i16 %a0, i32 15
+ %res = bitcast <16 x i16> %res15 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_set1_epi32(i32 %a0) nounwind {
+; X32-LABEL: test_mm256_set1_epi32:
+; X32: # BB#0:
+; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set1_epi32:
+; X64: # BB#0:
+; X64-NEXT: vmovd %edi, %xmm0
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
+ %res1 = insertelement <8 x i32> %res0, i32 %a0, i32 1
+ %res2 = insertelement <8 x i32> %res1, i32 %a0, i32 2
+ %res3 = insertelement <8 x i32> %res2, i32 %a0, i32 3
+ %res4 = insertelement <8 x i32> %res3, i32 %a0, i32 4
+ %res5 = insertelement <8 x i32> %res4, i32 %a0, i32 5
+ %res6 = insertelement <8 x i32> %res5, i32 %a0, i32 6
+ %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
+ %res = bitcast <8 x i32> %res7 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
+; X32-LABEL: test_mm256_set1_epi64x:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovd %ecx, %xmm0
+; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set1_epi64x:
+; X64: # BB#0:
+; X64-NEXT: vmovq %rdi, %xmm0
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
+ %res1 = insertelement <4 x i64> %res0, i64 %a0, i32 1
+ %res2 = insertelement <4 x i64> %res1, i64 %a0, i32 2
+ %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
+ ret <4 x i64> %res3
+}
+
+define <4 x double> @test_mm256_set1_pd(double %a0) nounwind {
+; X32-LABEL: test_mm256_set1_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set1_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <4 x double> undef, double %a0, i32 0
+ %res1 = insertelement <4 x double> %res0, double %a0, i32 1
+ %res2 = insertelement <4 x double> %res1, double %a0, i32 2
+ %res3 = insertelement <4 x double> %res2, double %a0, i32 3
+ ret <4 x double> %res3
+}
+
+define <8 x float> @test_mm256_set1_ps(float %a0) nounwind {
+; X32-LABEL: test_mm256_set1_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_set1_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <8 x float> undef, float %a0, i32 0
+ %res1 = insertelement <8 x float> %res0, float %a0, i32 1
+ %res2 = insertelement <8 x float> %res1, float %a0, i32 2
+ %res3 = insertelement <8 x float> %res2, float %a0, i32 3
+ %res4 = insertelement <8 x float> %res3, float %a0, i32 4
+ %res5 = insertelement <8 x float> %res4, float %a0, i32 5
+ %res6 = insertelement <8 x float> %res5, float %a0, i32 6
+ %res7 = insertelement <8 x float> %res6, float %a0, i32 7
+ ret <8 x float> %res7
+}
+
+define <4 x i64> @test_mm256_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
+; X32-LABEL: test_mm256_setr_epi8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovd %ecx, %xmm0
+; X32-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovd %ecx, %xmm1
+; X32-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setr_epi8:
+; X64: # BB#0:
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vmovd %eax, %xmm0
+; X64-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: movzbl %dil, %esi
+; X64-NEXT: vmovd %esi, %xmm1
+; X64-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl %dl, %eax
+; X64-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl %r9b, %eax
+; X64-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <32 x i8> undef, i8 %a0 , i32 0
+ %res1 = insertelement <32 x i8> %res0, i8 %a1 , i32 1
+ %res2 = insertelement <32 x i8> %res1, i8 %a2 , i32 2
+ %res3 = insertelement <32 x i8> %res2, i8 %a3 , i32 3
+ %res4 = insertelement <32 x i8> %res3, i8 %a4 , i32 4
+ %res5 = insertelement <32 x i8> %res4, i8 %a5 , i32 5
+ %res6 = insertelement <32 x i8> %res5, i8 %a6 , i32 6
+ %res7 = insertelement <32 x i8> %res6, i8 %a7 , i32 7
+ %res8 = insertelement <32 x i8> %res7, i8 %a8 , i32 8
+ %res9 = insertelement <32 x i8> %res8, i8 %a9 , i32 9
+ %res10 = insertelement <32 x i8> %res9, i8 %a10, i32 10
+ %res11 = insertelement <32 x i8> %res10, i8 %a11, i32 11
+ %res12 = insertelement <32 x i8> %res11, i8 %a12, i32 12
+ %res13 = insertelement <32 x i8> %res12, i8 %a13, i32 13
+ %res14 = insertelement <32 x i8> %res13, i8 %a14, i32 14
+ %res15 = insertelement <32 x i8> %res14, i8 %a15, i32 15
+ %res16 = insertelement <32 x i8> %res15, i8 %a16, i32 16
+ %res17 = insertelement <32 x i8> %res16, i8 %a17, i32 17
+ %res18 = insertelement <32 x i8> %res17, i8 %a18, i32 18
+ %res19 = insertelement <32 x i8> %res18, i8 %a19, i32 19
+ %res20 = insertelement <32 x i8> %res19, i8 %a20, i32 20
+ %res21 = insertelement <32 x i8> %res20, i8 %a21, i32 21
+ %res22 = insertelement <32 x i8> %res21, i8 %a22, i32 22
+ %res23 = insertelement <32 x i8> %res22, i8 %a23, i32 23
+ %res24 = insertelement <32 x i8> %res23, i8 %a24, i32 24
+ %res25 = insertelement <32 x i8> %res24, i8 %a25, i32 25
+ %res26 = insertelement <32 x i8> %res25, i8 %a26, i32 26
+ %res27 = insertelement <32 x i8> %res26, i8 %a27, i32 27
+ %res28 = insertelement <32 x i8> %res27, i8 %a28, i32 28
+ %res29 = insertelement <32 x i8> %res28, i8 %a29, i32 29
+ %res30 = insertelement <32 x i8> %res29, i8 %a30, i32 30
+ %res31 = insertelement <32 x i8> %res30, i8 %a31, i32 31
+ %res = bitcast <32 x i8> %res31 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
+; X32-LABEL: test_mm256_setr_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vmovd %eax, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vmovd %eax, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setr_epi16:
+; X64: # BB#0:
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vmovd %eax, %xmm0
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; X64-NEXT: vmovd %edi, %xmm1
+; X64-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1
+; X64-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
+; X64-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; X64-NEXT: vpinsrw $4, %r8d, %xmm1, %xmm1
+; X64-NEXT: vpinsrw $5, %r9d, %xmm1, %xmm1
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <16 x i16> undef, i16 %a0 , i32 0
+ %res1 = insertelement <16 x i16> %res0, i16 %a1 , i32 1
+ %res2 = insertelement <16 x i16> %res1, i16 %a2 , i32 2
+ %res3 = insertelement <16 x i16> %res2, i16 %a3 , i32 3
+ %res4 = insertelement <16 x i16> %res3, i16 %a4 , i32 4
+ %res5 = insertelement <16 x i16> %res4, i16 %a5 , i32 5
+ %res6 = insertelement <16 x i16> %res5, i16 %a6 , i32 6
+ %res7 = insertelement <16 x i16> %res6, i16 %a7 , i32 7
+ %res8 = insertelement <16 x i16> %res7, i16 %a8 , i32 8
+ %res9 = insertelement <16 x i16> %res8, i16 %a9 , i32 9
+ %res10 = insertelement <16 x i16> %res9, i16 %a10, i32 10
+ %res11 = insertelement <16 x i16> %res10, i16 %a11, i32 11
+ %res12 = insertelement <16 x i16> %res11, i16 %a12, i32 12
+ %res13 = insertelement <16 x i16> %res12, i16 %a13, i32 13
+ %res14 = insertelement <16 x i16> %res13, i16 %a14, i32 14
+ %res15 = insertelement <16 x i16> %res14, i16 %a15, i32 15
+ %res = bitcast <16 x i16> %res15 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
+; X32-LABEL: test_mm256_setr_epi32:
+; X32: # BB#0:
+; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setr_epi32:
+; X64: # BB#0:
+; X64-NEXT: vmovd %r8d, %xmm0
+; X64-NEXT: vpinsrd $1, %r9d, %xmm0, %xmm0
+; X64-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; X64-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; X64-NEXT: vmovd %edi, %xmm1
+; X64-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1
+; X64-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
+; X64-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
+ %res1 = insertelement <8 x i32> %res0, i32 %a1, i32 1
+ %res2 = insertelement <8 x i32> %res1, i32 %a2, i32 2
+ %res3 = insertelement <8 x i32> %res2, i32 %a3, i32 3
+ %res4 = insertelement <8 x i32> %res3, i32 %a4, i32 4
+ %res5 = insertelement <8 x i32> %res4, i32 %a5, i32 5
+ %res6 = insertelement <8 x i32> %res5, i32 %a6, i32 6
+ %res7 = insertelement <8 x i32> %res6, i32 %a7, i32 7
+ %res = bitcast <8 x i32> %res7 to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_setr_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
+; X32-LABEL: test_mm256_setr_epi64x:
+; X32: # BB#0:
+; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setr_epi64x:
+; X64: # BB#0:
+; X64-NEXT: vmovq %rcx, %xmm0
+; X64-NEXT: vmovq %rdx, %xmm1
+; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-NEXT: vmovq %rsi, %xmm1
+; X64-NEXT: vmovq %rdi, %xmm2
+; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
+ %res1 = insertelement <4 x i64> %res0, i64 %a1, i32 1
+ %res2 = insertelement <4 x i64> %res1, i64 %a2, i32 2
+ %res3 = insertelement <4 x i64> %res2, i64 %a3, i32 3
+ ret <4 x i64> %res3
+}
+
+define <8 x float> @test_mm256_setr_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_setr_m128:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setr_m128:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_setr_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_setr_m128d:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setr_m128d:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x double> %a0 to <4 x float>
+ %arg1 = bitcast <2 x double> %a1 to <4 x float>
+ %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %bc = bitcast <8 x float> %res to <4 x double>
+ ret <4 x double> %bc
+}
+
+define <4 x i64> @test_mm256_setr_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_setr_m128i:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setr_m128i:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x float>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x float>
+ %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %bc = bitcast <8 x float> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
+; X32-LABEL: test_mm256_setr_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setr_pd:
+; X64: # BB#0:
+; X64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <4 x double> undef, double %a0, i32 0
+ %res1 = insertelement <4 x double> %res0, double %a1, i32 1
+ %res2 = insertelement <4 x double> %res1, double %a2, i32 2
+ %res3 = insertelement <4 x double> %res2, double %a3, i32 3
+ ret <4 x double> %res3
+}
+
+define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
+; X32-LABEL: test_mm256_setr_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; X32-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
+; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3]
+; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
+; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setr_ps:
+; X64: # BB#0:
+; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
+; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
+; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
+; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
+; X64-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res0 = insertelement <8 x float> undef, float %a0, i32 0
+ %res1 = insertelement <8 x float> %res0, float %a1, i32 1
+ %res2 = insertelement <8 x float> %res1, float %a2, i32 2
+ %res3 = insertelement <8 x float> %res2, float %a3, i32 3
+ %res4 = insertelement <8 x float> %res3, float %a4, i32 4
+ %res5 = insertelement <8 x float> %res4, float %a5, i32 5
+ %res6 = insertelement <8 x float> %res5, float %a6, i32 6
+ %res7 = insertelement <8 x float> %res6, float %a7, i32 7
+ ret <8 x float> %res7
+}
+
+define <4 x double> @test_mm256_setzero_pd() nounwind {
+; X32-LABEL: test_mm256_setzero_pd:
+; X32: # BB#0:
+; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setzero_pd:
+; X64: # BB#0:
+; X64-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ ret <4 x double> zeroinitializer
+}
+
+define <8 x float> @test_mm256_setzero_ps() nounwind {
+; X32-LABEL: test_mm256_setzero_ps:
+; X32: # BB#0:
+; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setzero_ps:
+; X64: # BB#0:
+; X64-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ ret <8 x float> zeroinitializer
+}
+
+define <4 x i64> @test_mm256_setzero_si256() nounwind {
+; X32-LABEL: test_mm256_setzero_si256:
+; X32: # BB#0:
+; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_setzero_si256:
+; X64: # BB#0:
+; X64-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ ret <4 x i64> zeroinitializer
+}
+
+define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_shuffle_ps:
+; X32: # BB#0:
+; X32-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_ps:
+; X64: # BB#0:
+; X64-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_sqrt_pd(<4 x double> %a0) nounwind {
+; X32-LABEL: test_mm256_sqrt_pd:
+; X32: # BB#0:
+; X32-NEXT: vsqrtpd %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sqrt_pd:
+; X64: # BB#0:
+; X64-NEXT: vsqrtpd %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+
+define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_sqrt_ps:
+; X32: # BB#0:
+; X32-NEXT: vsqrtps %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sqrt_ps:
+; X64: # BB#0:
+; X64-NEXT: vsqrtps %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
+
+define void @test_mm256_store_pd(double* %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_store_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_store_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovaps %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <4 x double>*
+ store <4 x double> %a1, <4 x double>* %arg0, align 32
+ ret void
+}
+
+define void @test_mm256_store_ps(float* %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_store_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_store_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovaps %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <8 x float>*
+ store <8 x float> %a1, <8 x float>* %arg0, align 32
+ ret void
+}
+
+define void @test_mm256_store_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_store_si256:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_store_si256:
+; X64: # BB#0:
+; X64-NEXT: vmovaps %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ store <4 x i64> %a1, <4 x i64>* %a0, align 32
+ ret void
+}
+
+define void @test_mm256_storeu_pd(double* %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_storeu_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovups %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_storeu_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovups %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <4 x double>*
+ store <4 x double> %a1, <4 x double>* %arg0, align 1
+ ret void
+}
+
+define void @test_mm256_storeu_ps(float* %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_storeu_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovups %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_storeu_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovups %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <8 x float>*
+ store <8 x float> %a1, <8 x float>* %arg0, align 1
+ ret void
+}
+
+define void @test_mm256_storeu_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_storeu_si256:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovups %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_storeu_si256:
+; X64: # BB#0:
+; X64-NEXT: vmovups %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ store <4 x i64> %a1, <4 x i64>* %a0, align 1
+ ret void
+}
+
+define void @test_mm256_storeu2_m128(float* %a0, float* %a1, <8 x float> %a2) nounwind {
+; X32-LABEL: test_mm256_storeu2_m128:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovups %xmm0, (%ecx)
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vmovups %xmm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_storeu2_m128:
+; X64: # BB#0:
+; X64-NEXT: vmovups %xmm0, (%rdi)
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vmovups %xmm0, (%rsi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ %lo = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ store <4 x float> %lo, <4 x float>* %arg0, align 1
+ %arg1 = bitcast float* %a1 to <4 x float>*
+ %hi = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ store <4 x float> %hi, <4 x float>* %arg1, align 1
+ ret void
+}
+
+define void @test_mm256_storeu2_m128d(double* %a0, double* %a1, <4 x double> %a2) nounwind {
+; X32-LABEL: test_mm256_storeu2_m128d:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovups %xmm0, (%ecx)
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vmovups %xmm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_storeu2_m128d:
+; X64: # BB#0:
+; X64-NEXT: vmovups %xmm0, (%rdi)
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vmovups %xmm0, (%rsi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <2 x double>*
+ %lo = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 0, i32 1>
+ store <2 x double> %lo, <2 x double>* %arg0, align 1
+ %arg1 = bitcast double* %a1 to <2 x double>*
+ %hi = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 2, i32 3>
+ store <2 x double> %hi, <2 x double>* %arg1, align 1
+ ret void
+}
+
+define void @test_mm256_storeu2_m128i(<2 x i64>* %a0, <2 x i64>* %a1, <4 x i64> %a2) nounwind {
+; X32-LABEL: test_mm256_storeu2_m128i:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovups %xmm0, (%ecx)
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vmovups %xmm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_storeu2_m128i:
+; X64: # BB#0:
+; X64-NEXT: vmovups %xmm0, (%rdi)
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vmovups %xmm0, (%rsi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64>* %a0 to <2 x i64>*
+ %lo = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 0, i32 1>
+ store <2 x i64> %lo, <2 x i64>* %arg0, align 1
+ %arg1 = bitcast <2 x i64>* %a1 to <2 x i64>*
+ %hi = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 2, i32 3>
+ store <2 x i64> %hi, <2 x i64>* %arg1, align 1
+ ret void
+}
+
+define void @test_mm256_stream_pd(double *%a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_stream_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovntps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_stream_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovntps %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <4 x double>*
+ store <4 x double> %a1, <4 x double>* %arg0, align 32, !nontemporal !0
+ ret void
+}
+
+define void @test_mm256_stream_ps(float *%a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_stream_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovntps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_stream_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovntps %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <8 x float>*
+ store <8 x float> %a1, <8 x float>* %arg0, align 32, !nontemporal !0
+ ret void
+}
+
+define void @test_mm256_stream_si256(<4 x i64> *%a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_stream_si256:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovntps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_stream_si256:
+; X64: # BB#0:
+; X64-NEXT: vmovntps %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ store <4 x i64> %a1, <4 x i64>* %a0, align 32, !nontemporal !0
+ ret void
+}
+
+define <4 x double> @test_mm256_sub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_sub_pd:
+; X32: # BB#0:
+; X32-NEXT: vsubpd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sub_pd:
+; X64: # BB#0:
+; X64-NEXT: vsubpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = fsub <4 x double> %a0, %a1
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_sub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_sub_ps:
+; X32: # BB#0:
+; X32-NEXT: vsubps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sub_ps:
+; X64: # BB#0:
+; X64-NEXT: vsubps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = fsub <8 x float> %a0, %a1
+ ret <8 x float> %res
+}
+
+define i32 @test_mm_testc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_testc_pd:
+; X32: # BB#0:
+; X32-NEXT: vtestpd %xmm1, %xmm0
+; X32-NEXT: sbbl %eax, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testc_pd:
+; X64: # BB#0:
+; X64-NEXT: vtestpd %xmm1, %xmm0
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm256_testc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_testc_pd:
+; X32: # BB#0:
+; X32-NEXT: vtestpd %ymm1, %ymm0
+; X32-NEXT: sbbl %eax, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testc_pd:
+; X64: # BB#0:
+; X64-NEXT: vtestpd %ymm1, %ymm0
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define i32 @test_mm_testc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_testc_ps:
+; X32: # BB#0:
+; X32-NEXT: vtestps %xmm1, %xmm0
+; X32-NEXT: sbbl %eax, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testc_ps:
+; X64: # BB#0:
+; X64-NEXT: vtestps %xmm1, %xmm0
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm256_testc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_testc_ps:
+; X32: # BB#0:
+; X32-NEXT: vtestps %ymm1, %ymm0
+; X32-NEXT: sbbl %eax, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testc_ps:
+; X64: # BB#0:
+; X64-NEXT: vtestps %ymm1, %ymm0
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define i32 @test_mm256_testc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_testc_si256:
+; X32: # BB#0:
+; X32-NEXT: vptest %ymm1, %ymm0
+; X32-NEXT: sbbl %eax, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testc_si256:
+; X64: # BB#0:
+; X64-NEXT: vptest %ymm1, %ymm0
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define i32 @test_mm_testnzc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_testnzc_pd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vtestpd %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testnzc_pd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vtestpd %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm256_testnzc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_testnzc_pd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vtestpd %ymm1, %ymm0
+; X32-NEXT: seta %al
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testnzc_pd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vtestpd %ymm1, %ymm0
+; X64-NEXT: seta %al
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define i32 @test_mm_testnzc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_testnzc_ps:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vtestps %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testnzc_ps:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vtestps %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm256_testnzc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_testnzc_ps:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vtestps %ymm1, %ymm0
+; X32-NEXT: seta %al
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testnzc_ps:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vtestps %ymm1, %ymm0
+; X64-NEXT: seta %al
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define i32 @test_mm256_testnzc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_testnzc_si256:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vptest %ymm1, %ymm0
+; X32-NEXT: seta %al
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testnzc_si256:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vptest %ymm1, %ymm0
+; X64-NEXT: seta %al
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define i32 @test_mm_testz_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_testz_pd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vtestpd %xmm1, %xmm0
+; X32-NEXT: sete %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testz_pd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vtestpd %xmm1, %xmm0
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm256_testz_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_testz_pd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vtestpd %ymm1, %ymm0
+; X32-NEXT: sete %al
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testz_pd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vtestpd %ymm1, %ymm0
+; X64-NEXT: sete %al
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define i32 @test_mm_testz_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_testz_ps:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vtestps %xmm1, %xmm0
+; X32-NEXT: sete %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testz_ps:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vtestps %xmm1, %xmm0
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm256_testz_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_testz_ps:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vtestps %ymm1, %ymm0
+; X32-NEXT: sete %al
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testz_ps:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vtestps %ymm1, %ymm0
+; X64-NEXT: sete %al
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.vtestz.ps.256(<8 x float>, <8 x float>) nounwind readnone
+
+define i32 @test_mm256_testz_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_testz_si256:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: vptest %ymm1, %ymm0
+; X32-NEXT: sete %al
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testz_si256:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: vptest %ymm1, %ymm0
+; X64-NEXT: sete %al
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <2 x double> @test_mm_undefined_pd() nounwind {
+; X32-LABEL: test_mm_undefined_pd:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_undefined_pd:
+; X64: # BB#0:
+; X64-NEXT: retq
+ ret <2 x double> undef
+}
+
+define <4 x double> @test_mm256_undefined_pd() nounwind {
+; X32-LABEL: test_mm256_undefined_pd:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_undefined_pd:
+; X64: # BB#0:
+; X64-NEXT: retq
+ ret <4 x double> undef
+}
+
+define <8 x float> @test_mm256_undefined_ps() nounwind {
+; X32-LABEL: test_mm256_undefined_ps:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_undefined_ps:
+; X64: # BB#0:
+; X64-NEXT: retq
+ ret <8 x float> undef
+}
+
+define <4 x i64> @test_mm256_undefined_si256() nounwind {
+; X32-LABEL: test_mm256_undefined_si256:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_undefined_si256:
+; X64: # BB#0:
+; X64-NEXT: retq
+ ret <4 x i64> undef
+}
+
+define <4 x double> @test_mm256_unpackhi_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_unpackhi_pd:
+; X32: # BB#0:
+; X32-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpackhi_pd:
+; X64: # BB#0:
+; X64-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_unpackhi_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_unpackhi_ps:
+; X32: # BB#0:
+; X32-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpackhi_ps:
+; X64: # BB#0:
+; X64-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_unpacklo_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_unpacklo_pd:
+; X32: # BB#0:
+; X32-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpacklo_pd:
+; X64: # BB#0:
+; X64-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x double> %res
+}
+
+define <8 x float> @test_mm256_unpacklo_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_unpacklo_ps:
+; X32: # BB#0:
+; X32-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpacklo_ps:
+; X64: # BB#0:
+; X64-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ ret <8 x float> %res
+}
+
+define <4 x double> @test_mm256_xor_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
+; X32-LABEL: test_mm256_xor_pd:
+; X32: # BB#0:
+; X32-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_xor_pd:
+; X64: # BB#0:
+; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = bitcast <4 x double> %a0 to <4 x i64>
+ %2 = bitcast <4 x double> %a1 to <4 x i64>
+ %res = xor <4 x i64> %1, %2
+ %bc = bitcast <4 x i64> %res to <4 x double>
+ ret <4 x double> %bc
+}
+
+define <8 x float> @test_mm256_xor_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
+; X32-LABEL: test_mm256_xor_ps:
+; X32: # BB#0:
+; X32-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_xor_ps:
+; X64: # BB#0:
+; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = bitcast <8 x float> %a0 to <8 x i32>
+ %2 = bitcast <8 x float> %a1 to <8 x i32>
+ %res = xor <8 x i32> %1, %2
+ %bc = bitcast <8 x i32> %res to <8 x float>
+ ret <8 x float> %bc
+}
+
+define void @test_mm256_zeroall() nounwind {
+; X32-LABEL: test_mm256_zeroall:
+; X32: # BB#0:
+; X32-NEXT: vzeroall
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_zeroall:
+; X64: # BB#0:
+; X64-NEXT: vzeroall
+; X64-NEXT: retq
+ call void @llvm.x86.avx.vzeroall()
+ ret void
+}
+declare void @llvm.x86.avx.vzeroall() nounwind readnone
+
+define void @test_mm256_zeroupper() nounwind {
+; X32-LABEL: test_mm256_zeroupper:
+; X32: # BB#0:
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_zeroupper:
+; X64: # BB#0:
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ call void @llvm.x86.avx.vzeroupper()
+ ret void
+}
+declare void @llvm.x86.avx.vzeroupper() nounwind readnone
+
+!0 = !{i32 1}
diff --git a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index 4867869863e3..a7b4c6b285d8 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -1,26 +1,33 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx | FileCheck %s
-; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.
+; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.
define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
-; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 1)
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
define <8 x float> @test_x86_avx_vinsertf128_ps_256_1(<8 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vinsertf128_ps_256_1:
-; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-LABEL: test_x86_avx_vinsertf128_ps_256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 1)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_1:
-; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 1)
ret <8 x i32> %res
}
@@ -29,34 +36,46 @@ define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1
; of a vinsertf128 $0 which should be optimized into a blend, so just check that it's
; not a vinsertf128 $1.
define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
-; CHECK-NOT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; CHECK-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
-; We don't check any vextractf128 variant with immediate 0 because that's just a move.
+; We don't check any vextractf128 variant with immediate 0 because that's just a move.
define <2 x double> @test_x86_avx_vextractf128_pd_256_1(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_vextractf128_pd_256_1:
-; CHECK: vextractf128 $1, %ymm0, %xmm0
+; CHECK-LABEL: test_x86_avx_vextractf128_pd_256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
%res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 1)
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
define <4 x float> @test_x86_avx_vextractf128_ps_256_1(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_vextractf128_ps_256_1:
-; CHECK: vextractf128 $1, %ymm0, %xmm0
+; CHECK-LABEL: test_x86_avx_vextractf128_ps_256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
%res = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a0, i8 1)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
define <4 x i32> @test_x86_avx_vextractf128_si_256_1(<8 x i32> %a0) {
-; CHECK-LABEL: test_x86_avx_vextractf128_si_256_1:
-; CHECK: vextractf128 $1, %ymm0, %xmm0
+; CHECK-LABEL: test_x86_avx_vextractf128_si_256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %a0, i8 1)
ret <4 x i32> %res
}
@@ -66,16 +85,21 @@ declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind read
; of a vextractf128 $0 which should be optimized away, so just check that it's
; not a vextractf128 of any kind.
define <2 x double> @test_x86_avx_extractf128_pd_256_2(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_extractf128_pd_256_2:
-; CHECK-NOT: vextractf128
+; CHECK-LABEL: test_x86_avx_extractf128_pd_256_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
%res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 2)
ret <2 x double> %res
}
define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_blend_pd_256:
-; CHECK: vblendpd
+; CHECK-LABEL: test_x86_avx_blend_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
+; CHECK-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -83,8 +107,10 @@ declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32)
define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_blend_ps_256:
-; CHECK: vblendps
+; CHECK-LABEL: test_x86_avx_blend_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; CHECK-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -92,8 +118,10 @@ declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32) no
define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_dp_ps_256:
-; CHECK: vdpps
+; CHECK-LABEL: test_x86_avx_dp_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -101,8 +129,10 @@ declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounw
define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
-; CHECK-LABEL: test_x86_sse2_psll_dq:
-; CHECK: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; CHECK-LABEL: test_x86_sse2_psll_dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -110,8 +140,10 @@ declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
-; CHECK-LABEL: test_x86_sse2_psrl_dq:
-; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-LABEL: test_x86_sse2_psrl_dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -119,8 +151,10 @@ declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse41_blendpd:
-; CHECK: vblendpd
+; CHECK-LABEL: test_x86_sse41_blendpd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; CHECK-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -128,8 +162,10 @@ declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nou
define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse41_blendps:
-; CHECK: vblendps
+; CHECK-LABEL: test_x86_sse41_blendps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; CHECK-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -137,8 +173,10 @@ declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwi
define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse41_pblendw:
-; CHECK: vpblendw
+; CHECK-LABEL: test_x86_sse41_pblendw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6,7]
+; CHECK-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -147,7 +185,7 @@ declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind rea
define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbd:
-; CHECK: # BB#0:
+; CHECK: ## BB#0:
; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
@@ -158,7 +196,7 @@ declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbq:
-; CHECK: # BB#0:
+; CHECK: ## BB#0:
; CHECK-NEXT: vpmovsxbq %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
@@ -169,7 +207,7 @@ declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbw:
-; CHECK: # BB#0:
+; CHECK: ## BB#0:
; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
@@ -180,7 +218,7 @@ declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxdq:
-; CHECK: # BB#0:
+; CHECK: ## BB#0:
; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
@@ -191,7 +229,7 @@ declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxwd:
-; CHECK: # BB#0:
+; CHECK: ## BB#0:
; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
@@ -202,10 +240,282 @@ declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxwq:
-; CHECK: # BB#0:
+; CHECK: ## BB#0:
; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxbd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: retl
+ %res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovzxbq(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxbq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retl
+ %res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_pmovzxbw(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxbw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: retl
+ %res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovzxdq(<4 x i32> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxdq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; CHECK-NEXT: retl
+ %res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmovzxwd(<8 x i16> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxwd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: retl
+ %res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovzxwq(<8 x i16> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxwq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; CHECK-NEXT: retl
+ %res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cvtdq2pd(<4 x i32> %a0) {
+; CHECK-LABEL: test_x86_sse2_cvtdq2pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm0
+; CHECK-NEXT: retl
+ %res = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
+
+
+define <4 x double> @test_x86_avx_cvtdq2_pd_256(<4 x i32> %a0) {
+; CHECK-LABEL: test_x86_avx_cvtdq2_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0
+; CHECK-NEXT: retl
+ %res = call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %a0) ; <<4 x double>> [#uses=1]
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_sse2_cvtps2pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtps2pd %xmm0, %xmm0
+; CHECK-NEXT: retl
+ %res = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
+
+
+define <4 x double> @test_x86_avx_cvt_ps2_pd_256(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_avx_cvt_ps2_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0
+; CHECK-NEXT: retl
+ %res = call <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float> %a0) ; <<4 x double>> [#uses=1]
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone
+
+
+define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
+; CHECK-LABEL: test_x86_avx_cvtt_pd2dq_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvttpd2dqy %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
+ %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
+; CHECK-LABEL: test_x86_avx_cvtt_ps2dq_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
+; CHECK-NEXT: retl
+ %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
+
+
+define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
+ ; add operation forces the execution domain.
+; CHECK-LABEL: test_x86_sse2_storeu_dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: vpaddb LCPI34_0, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqu %xmm0, (%eax)
+; CHECK-NEXT: retl
+ %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
+ ret void
+}
+declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
+
+
+define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
+ ; fadd operation forces the execution domain.
+; CHECK-LABEL: test_x86_sse2_storeu_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovupd %xmm0, (%eax)
+; CHECK-NEXT: retl
+ %a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
+ call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
+ ret void
+}
+declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
+
+
+define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
+; CHECK-LABEL: test_x86_sse_storeu_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: vmovups %xmm0, (%eax)
+; CHECK-NEXT: retl
+ call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
+ ret void
+}
+declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
+
+
+define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
+ ; FIXME: unfortunately the execution domain fix pass changes this to vmovups and it's hard to force with no 256-bit integer instructions
+ ; add operation forces the execution domain.
+; CHECK-LABEL: test_x86_avx_storeu_dq_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vmovups %ymm0, (%eax)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
+ %a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
+ ret void
+}
+declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
+
+
+define void @test_x86_avx_storeu_pd_256(i8* %a0, <4 x double> %a1) {
+ ; fadd operation forces the execution domain.
+; CHECK-LABEL: test_x86_avx_storeu_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vmovupd %ymm0, (%eax)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
+ %a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
+ call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a2)
+ ret void
+}
+declare void @llvm.x86.avx.storeu.pd.256(i8*, <4 x double>) nounwind
+
+
+define void @test_x86_avx_storeu_ps_256(i8* %a0, <8 x float> %a1) {
+; CHECK-LABEL: test_x86_avx_storeu_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: vmovups %ymm0, (%eax)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
+ call void @llvm.x86.avx.storeu.ps.256(i8* %a0, <8 x float> %a1)
+ ret void
+}
+declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) nounwind
+
+
+define <2 x double> @test_x86_avx_vpermil_pd(<2 x double> %a0) {
+; CHECK-LABEL: test_x86_avx_vpermil_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: retl
+ %res = call <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double> %a0, i8 1) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8) nounwind readnone
+
+
+define <4 x double> @test_x86_avx_vpermil_pd_256(<4 x double> %a0) {
+; CHECK-LABEL: test_x86_avx_vpermil_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
+; CHECK-NEXT: retl
+ %res = call <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double> %a0, i8 7) ; <<4 x double>> [#uses=1]
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8) nounwind readnone
+
+
+define <4 x float> @test_x86_avx_vpermil_ps(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_avx_vpermil_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,0]
+; CHECK-NEXT: retl
+ %res = call <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float> %a0, i8 7) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float>, i8) nounwind readnone
+
+
+define <8 x float> @test_x86_avx_vpermil_ps_256(<8 x float> %a0) {
+; CHECK-LABEL: test_x86_avx_vpermil_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,0,0,7,5,4,4]
+; CHECK-NEXT: retl
+ %res = call <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float> %a0, i8 7) ; <<8 x float>> [#uses=1]
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index 206be2396cba..35763297d816 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -1,11 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mattr=avx,aes,pclmul | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx,aes,pclmul | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx512vl,aes,pclmul | FileCheck %s --check-prefix=AVX512VL
define <2 x i64> @test_x86_aesni_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_aesni_aesdec:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaesdec %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_aesni_aesdec:
+; AVX: ## BB#0:
+; AVX-NEXT: vaesdec %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_aesni_aesdec:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaesdec %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -13,10 +19,15 @@ declare <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x i64> @test_x86_aesni_aesdeclast(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_aesni_aesdeclast:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaesdeclast %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_aesni_aesdeclast:
+; AVX: ## BB#0:
+; AVX-NEXT: vaesdeclast %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_aesni_aesdeclast:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaesdeclast %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -24,10 +35,15 @@ declare <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64>, <2 x i64>) nounwind read
define <2 x i64> @test_x86_aesni_aesenc(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_aesni_aesenc:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaesenc %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_aesni_aesenc:
+; AVX: ## BB#0:
+; AVX-NEXT: vaesenc %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_aesni_aesenc:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaesenc %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -35,10 +51,15 @@ declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x i64> @test_x86_aesni_aesenclast(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_aesni_aesenclast:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaesenclast %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_aesni_aesenclast:
+; AVX: ## BB#0:
+; AVX-NEXT: vaesenclast %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_aesni_aesenclast:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaesenclast %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -46,10 +67,15 @@ declare <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64>, <2 x i64>) nounwind read
define <2 x i64> @test_x86_aesni_aesimc(<2 x i64> %a0) {
-; CHECK-LABEL: test_x86_aesni_aesimc:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaesimc %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_aesni_aesimc:
+; AVX: ## BB#0:
+; AVX-NEXT: vaesimc %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_aesni_aesimc:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaesimc %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64> %a0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -57,10 +83,15 @@ declare <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64>) nounwind readnone
define <2 x i64> @test_x86_aesni_aeskeygenassist(<2 x i64> %a0) {
-; CHECK-LABEL: test_x86_aesni_aeskeygenassist:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaeskeygenassist $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_aesni_aeskeygenassist:
+; AVX: ## BB#0:
+; AVX-NEXT: vaeskeygenassist $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_aesni_aeskeygenassist:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaeskeygenassist $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -68,10 +99,15 @@ declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readno
define <2 x double> @test_x86_sse2_add_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_add_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_add_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_add_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -79,10 +115,15 @@ declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_cmp_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_cmp_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcmpordpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cmp_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vcmpordpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cmp_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcmpordpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -90,10 +131,15 @@ declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounw
define <2 x double> @test_x86_sse2_cmp_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_cmp_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcmpordsd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cmp_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vcmpordsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cmp_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcmpordsd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -101,12 +147,23 @@ declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounw
define i32 @test_x86_sse2_comieq_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_comieq_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_comieq_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vcomisd %xmm1, %xmm0
+; AVX-NEXT: setnp %al
+; AVX-NEXT: sete %cl
+; AVX-NEXT: andb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_comieq_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcomisd %xmm1, %xmm0
+; AVX512VL-NEXT: setnp %al
+; AVX512VL-NEXT: sete %cl
+; AVX512VL-NEXT: andb %al, %cl
+; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -114,12 +171,19 @@ declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comige_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_comige_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd %xmm1, %xmm0
-; CHECK-NEXT: setae %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_comige_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomisd %xmm1, %xmm0
+; AVX-NEXT: setae %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_comige_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vcomisd %xmm1, %xmm0
+; AVX512VL-NEXT: setae %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comige.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -127,12 +191,19 @@ declare i32 @llvm.x86.sse2.comige.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comigt_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_comigt_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd %xmm1, %xmm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_comigt_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomisd %xmm1, %xmm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_comigt_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vcomisd %xmm1, %xmm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comigt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -140,12 +211,19 @@ declare i32 @llvm.x86.sse2.comigt.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comile_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_comile_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd %xmm1, %xmm0
-; CHECK-NEXT: setbe %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_comile_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomisd %xmm0, %xmm1
+; AVX-NEXT: setae %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_comile_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vcomisd %xmm0, %xmm1
+; AVX512VL-NEXT: setae %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comile.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -153,12 +231,19 @@ declare i32 @llvm.x86.sse2.comile.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comilt_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_comilt_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_comilt_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomisd %xmm0, %xmm1
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_comilt_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vcomisd %xmm0, %xmm1
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comilt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -166,34 +251,39 @@ declare i32 @llvm.x86.sse2.comilt.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comineq_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_comineq_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd %xmm1, %xmm0
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_comineq_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vcomisd %xmm1, %xmm0
+; AVX-NEXT: setp %al
+; AVX-NEXT: setne %cl
+; AVX-NEXT: orb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_comineq_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcomisd %xmm1, %xmm0
+; AVX512VL-NEXT: setp %al
+; AVX512VL-NEXT: setne %cl
+; AVX512VL-NEXT: orb %al, %cl
+; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
declare i32 @llvm.x86.sse2.comineq.sd(<2 x double>, <2 x double>) nounwind readnone
-define <2 x double> @test_x86_sse2_cvtdq2pd(<4 x i32> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvtdq2pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm0
-; CHECK-NEXT: retl
- %res = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0) ; <<2 x double>> [#uses=1]
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
-
-
define <4 x float> @test_x86_sse2_cvtdq2ps(<4 x i32> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvtdq2ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvtdq2ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvtdq2ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtdq2ps %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -201,10 +291,15 @@ declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse2_cvtpd2dq(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvtpd2dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvtpd2dq:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtpd2dq %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvtpd2dq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtpd2dq %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -212,10 +307,15 @@ declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
define <4 x float> @test_x86_sse2_cvtpd2ps(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvtpd2ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvtpd2ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtpd2ps %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvtpd2ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtpd2ps %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -223,32 +323,31 @@ declare <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double>) nounwind readnone
define <4 x i32> @test_x86_sse2_cvtps2dq(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvtps2dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtps2dq %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvtps2dq:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtps2dq %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvtps2dq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtps2dq %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
-define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvtps2pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtps2pd %xmm0, %xmm0
-; CHECK-NEXT: retl
- %res = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0) ; <<2 x double>> [#uses=1]
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
-
-
define i32 @test_x86_sse2_cvtsd2si(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvtsd2si:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2si %xmm0, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvtsd2si:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtsd2si %xmm0, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvtsd2si:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtsd2si %xmm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -256,33 +355,47 @@ declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
define <4 x float> @test_x86_sse2_cvtsd2ss(<4 x float> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_cvtsd2ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2ss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvtsd2ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtsd2ss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvtsd2ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtsd2ss %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
-define <2 x double> @test_x86_sse2_cvtsi2sd(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvtsi2sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: vcvtsi2sdl %eax, %xmm0, %xmm0
-; CHECK-NEXT: retl
- %res = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> %a0, i32 7) ; <<2 x double>> [#uses=1]
+define <2 x double> @test_x86_sse2_cvtsi2sd(<2 x double> %a0, i32 %a1) {
+; AVX-LABEL: test_x86_sse2_cvtsi2sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtsi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvtsi2sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtsi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX512VL-NEXT: retl
+ %res = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> %a0, i32 %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
define <2 x double> @test_x86_sse2_cvtss2sd(<2 x double> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse2_cvtss2sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvtss2sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvtss2sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> %a0, <4 x float> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -290,10 +403,15 @@ declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind
define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvttpd2dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvttpd2dq:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvttpd2dq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -301,10 +419,15 @@ declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvttps2dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvttps2dq:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvttps2dq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -312,10 +435,15 @@ declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
define i32 @test_x86_sse2_cvttsd2si(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvttsd2si:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvttsd2si %xmm0, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_cvttsd2si:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvttsd2si %xmm0, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_cvttsd2si:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvttsd2si %xmm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -323,10 +451,15 @@ declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
define <2 x double> @test_x86_sse2_div_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_div_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vdivsd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_div_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_div_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vdivsd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -335,10 +468,15 @@ declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_max_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_max_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_max_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_max_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -346,10 +484,15 @@ declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_max_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_max_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_max_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_max_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -357,10 +500,15 @@ declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_min_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_min_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vminpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_min_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_min_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vminpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -368,10 +516,15 @@ declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_min_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_min_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vminsd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_min_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_min_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vminsd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -379,10 +532,15 @@ declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind
define i32 @test_x86_sse2_movmsk_pd(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_movmsk_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovmskpd %xmm0, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_movmsk_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovmskpd %xmm0, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_movmsk_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmovmskpd %xmm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -392,10 +550,15 @@ declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
define <2 x double> @test_x86_sse2_mul_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_mul_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_mul_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_mul_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmulsd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -403,10 +566,15 @@ declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind
define <8 x i16> @test_x86_sse2_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse2_packssdw_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_packssdw_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_packssdw_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -414,10 +582,15 @@ declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind rea
define <16 x i8> @test_x86_sse2_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_packsswb_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_packsswb_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_packsswb_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -425,10 +598,15 @@ declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind rea
define <16 x i8> @test_x86_sse2_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_packuswb_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_packuswb_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_packuswb_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -436,10 +614,15 @@ declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind rea
define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse2_padds_b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_padds_b:
+; AVX: ## BB#0:
+; AVX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_padds_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -447,10 +630,15 @@ declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_padds_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_padds_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_padds_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpaddsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_padds_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpaddsw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -458,10 +646,15 @@ declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse2_paddus_b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_paddus_b:
+; AVX: ## BB#0:
+; AVX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_paddus_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -469,10 +662,15 @@ declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnon
define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_paddus_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_paddus_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_paddus_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -480,10 +678,15 @@ declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnon
define <16 x i8> @test_x86_sse2_pavg_b(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse2_pavg_b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pavg_b:
+; AVX: ## BB#0:
+; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pavg_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpavgb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -491,10 +694,15 @@ declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_pavg_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_pavg_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pavg_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pavg_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpavgw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -502,10 +710,15 @@ declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_pmadd_wd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pmadd_wd:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pmadd_wd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -513,10 +726,15 @@ declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnon
define <8 x i16> @test_x86_sse2_pmaxs_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_pmaxs_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pmaxs_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pmaxs_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -524,10 +742,15 @@ declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_pmaxu_b(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse2_pmaxu_b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pmaxu_b:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pmaxu_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -535,10 +758,15 @@ declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_pmins_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_pmins_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pmins_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pmins_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -546,10 +774,15 @@ declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_pminu_b(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse2_pminu_b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pminu_b:
+; AVX: ## BB#0:
+; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pminu_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -557,10 +790,15 @@ declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone
define i32 @test_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
-; CHECK-LABEL: test_x86_sse2_pmovmskb_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovmskb %xmm0, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pmovmskb_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmovmskb %xmm0, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pmovmskb_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmovmskb %xmm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -568,10 +806,15 @@ declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_pmulh_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_pmulh_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pmulh_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pmulh_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -579,10 +822,15 @@ declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @test_x86_sse2_pmulhu_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_pmulhu_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pmulhu_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pmulhu_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -590,10 +838,15 @@ declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnon
define <2 x i64> @test_x86_sse2_pmulu_dq(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse2_pmulu_dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pmulu_dq:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pmulu_dq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -601,10 +854,15 @@ declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnon
define <2 x i64> @test_x86_sse2_psad_bw(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse2_psad_bw:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psad_bw:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psad_bw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -612,10 +870,15 @@ declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse2_psll_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psll_d:
+; AVX: ## BB#0:
+; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psll_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpslld %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -623,10 +886,15 @@ declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_x86_sse2_psll_q(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_sse2_psll_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsllq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psll_q:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psll_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -634,10 +902,15 @@ declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @test_x86_sse2_psll_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_psll_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psll_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psll_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -645,10 +918,15 @@ declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_pslli_d(<4 x i32> %a0) {
-; CHECK-LABEL: test_x86_sse2_pslli_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpslld $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pslli_d:
+; AVX: ## BB#0:
+; AVX-NEXT: vpslld $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pslli_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpslld $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -656,10 +934,15 @@ declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_pslli_q(<2 x i64> %a0) {
-; CHECK-LABEL: test_x86_sse2_pslli_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsllq $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pslli_q:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsllq $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pslli_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllq $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -667,10 +950,15 @@ declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
define <8 x i16> @test_x86_sse2_pslli_w(<8 x i16> %a0) {
-; CHECK-LABEL: test_x86_sse2_pslli_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_pslli_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_pslli_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -678,10 +966,15 @@ declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone
define <4 x i32> @test_x86_sse2_psra_d(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse2_psra_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psra_d:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psra_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -689,10 +982,15 @@ declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @test_x86_sse2_psra_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_psra_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psra_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psra_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -700,10 +998,15 @@ declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_psrai_d(<4 x i32> %a0) {
-; CHECK-LABEL: test_x86_sse2_psrai_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrad $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psrai_d:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsrad $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psrai_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrad $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -711,10 +1014,15 @@ declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) nounwind readnone
define <8 x i16> @test_x86_sse2_psrai_w(<8 x i16> %a0) {
-; CHECK-LABEL: test_x86_sse2_psrai_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsraw $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psrai_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsraw $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psrai_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsraw $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -722,10 +1030,15 @@ declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse2_psrl_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psrl_d:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psrl_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -733,10 +1046,15 @@ declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_x86_sse2_psrl_q(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_sse2_psrl_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psrl_q:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psrl_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -744,10 +1062,15 @@ declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @test_x86_sse2_psrl_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_psrl_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psrl_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psrl_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -755,10 +1078,15 @@ declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_psrli_d(<4 x i32> %a0) {
-; CHECK-LABEL: test_x86_sse2_psrli_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrld $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psrli_d:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsrld $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psrli_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrld $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -766,10 +1094,15 @@ declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_psrli_q(<2 x i64> %a0) {
-; CHECK-LABEL: test_x86_sse2_psrli_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrlq $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psrli_q:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsrlq $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psrli_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlq $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -777,10 +1110,15 @@ declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) nounwind readnone
define <8 x i16> @test_x86_sse2_psrli_w(<8 x i16> %a0) {
-; CHECK-LABEL: test_x86_sse2_psrli_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrlw $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psrli_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psrli_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -788,10 +1126,15 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone
define <16 x i8> @test_x86_sse2_psubs_b(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse2_psubs_b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psubs_b:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psubs_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -799,10 +1142,15 @@ declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_psubs_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_psubs_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psubs_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsubsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psubs_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsubsw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -810,10 +1158,15 @@ declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse2_psubus_b:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psubus_b:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psubus_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -821,10 +1174,15 @@ declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnon
define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse2_psubus_w:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_psubus_w:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_psubus_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -832,10 +1190,15 @@ declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnon
define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_sqrt_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vsqrtpd %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_sqrt_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vsqrtpd %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_sqrt_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vsqrtpd %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -843,65 +1206,31 @@ declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_sqrt_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_sqrt_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_sqrt_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
-define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse2_storel_dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmovlps %xmm0, (%eax)
-; CHECK-NEXT: retl
- call void @llvm.x86.sse2.storel.dq(i8* %a0, <4 x i32> %a1)
- ret void
-}
-declare void @llvm.x86.sse2.storel.dq(i8*, <4 x i32>) nounwind
-
-
-define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
- ; add operation forces the execution domain.
-; CHECK-LABEL: test_x86_sse2_storeu_dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vpaddb LCPI77_0, %xmm0, %xmm0
-; CHECK-NEXT: vmovdqu %xmm0, (%eax)
-; CHECK-NEXT: retl
- %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
- call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
- ret void
-}
-declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
-
-
-define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
- ; fadd operation forces the execution domain.
-; CHECK-LABEL: test_x86_sse2_storeu_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vmovupd %xmm0, (%eax)
-; CHECK-NEXT: retl
- %a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
- call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
- ret void
-}
-declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
-
-
define <2 x double> @test_x86_sse2_sub_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_sub_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vsubsd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_sub_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_sub_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vsubsd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -909,12 +1238,23 @@ declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind
define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_ucomieq_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_ucomieq_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vucomisd %xmm1, %xmm0
+; AVX-NEXT: setnp %al
+; AVX-NEXT: sete %cl
+; AVX-NEXT: andb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_ucomieq_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vucomisd %xmm1, %xmm0
+; AVX512VL-NEXT: setnp %al
+; AVX512VL-NEXT: sete %cl
+; AVX512VL-NEXT: andb %al, %cl
+; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -922,12 +1262,19 @@ declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomige_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_ucomige_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd %xmm1, %xmm0
-; CHECK-NEXT: setae %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_ucomige_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vucomisd %xmm1, %xmm0
+; AVX-NEXT: setae %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_ucomige_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vucomisd %xmm1, %xmm0
+; AVX512VL-NEXT: setae %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -935,12 +1282,19 @@ declare i32 @llvm.x86.sse2.ucomige.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomigt_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_ucomigt_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd %xmm1, %xmm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_ucomigt_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vucomisd %xmm1, %xmm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_ucomigt_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vucomisd %xmm1, %xmm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomigt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -948,12 +1302,19 @@ declare i32 @llvm.x86.sse2.ucomigt.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomile_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_ucomile_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd %xmm1, %xmm0
-; CHECK-NEXT: setbe %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_ucomile_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vucomisd %xmm0, %xmm1
+; AVX-NEXT: setae %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_ucomile_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vucomisd %xmm0, %xmm1
+; AVX512VL-NEXT: setae %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -961,12 +1322,19 @@ declare i32 @llvm.x86.sse2.ucomile.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomilt_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_ucomilt_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_ucomilt_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vucomisd %xmm0, %xmm1
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_ucomilt_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vucomisd %xmm0, %xmm1
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomilt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -974,12 +1342,23 @@ declare i32 @llvm.x86.sse2.ucomilt.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_ucomineq_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd %xmm1, %xmm0
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse2_ucomineq_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vucomisd %xmm1, %xmm0
+; AVX-NEXT: setp %al
+; AVX-NEXT: setne %cl
+; AVX-NEXT: orb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse2_ucomineq_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vucomisd %xmm1, %xmm0
+; AVX512VL-NEXT: setp %al
+; AVX512VL-NEXT: setne %cl
+; AVX512VL-NEXT: orb %al, %cl
+; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -987,10 +1366,15 @@ declare i32 @llvm.x86.sse2.ucomineq.sd(<2 x double>, <2 x double>) nounwind read
define <2 x double> @test_x86_sse3_addsub_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse3_addsub_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse3_addsub_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse3_addsub_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -998,10 +1382,15 @@ declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwi
define <4 x float> @test_x86_sse3_addsub_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse3_addsub_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse3_addsub_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse3_addsub_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1009,10 +1398,15 @@ declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind
define <2 x double> @test_x86_sse3_hadd_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse3_hadd_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse3_hadd_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse3_hadd_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1020,10 +1414,15 @@ declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind
define <4 x float> @test_x86_sse3_hadd_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse3_hadd_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse3_hadd_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse3_hadd_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1031,10 +1430,15 @@ declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind re
define <2 x double> @test_x86_sse3_hsub_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse3_hsub_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse3_hsub_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse3_hsub_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1042,10 +1446,15 @@ declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind
define <4 x float> @test_x86_sse3_hsub_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse3_hsub_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhsubps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse3_hsub_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse3_hsub_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vhsubps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1053,11 +1462,17 @@ declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind re
define <16 x i8> @test_x86_sse3_ldu_dq(i8* %a0) {
-; CHECK-LABEL: test_x86_sse3_ldu_dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vlddqu (%eax), %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse3_ldu_dq:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vlddqu (%eax), %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse3_ldu_dq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vlddqu (%eax), %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %a0) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -1065,10 +1480,15 @@ declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8*) nounwind readonly
define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
-; CHECK-LABEL: test_x86_sse41_blendvpd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_blendvpd:
+; AVX: ## BB#0:
+; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_blendvpd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1076,10 +1496,15 @@ declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x d
define <4 x float> @test_x86_sse41_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
-; CHECK-LABEL: test_x86_sse41_blendvps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_blendvps:
+; AVX: ## BB#0:
+; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_blendvps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1087,10 +1512,15 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse41_dppd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_dppd:
+; AVX: ## BB#0:
+; AVX-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_dppd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1098,10 +1528,15 @@ declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwi
define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse41_dpps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_dpps:
+; AVX: ## BB#0:
+; AVX-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_dpps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1109,11 +1544,16 @@ declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind
define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse41_insertps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[3]
-; CHECK-NEXT: retl
- %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
+; AVX-LABEL: test_x86_sse41_insertps:
+; AVX: ## BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],zero,xmm0[3]
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_insertps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],zero,xmm0[3]
+; AVX512VL-NEXT: retl
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 21) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
@@ -1121,10 +1561,15 @@ declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounw
define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse41_mpsadbw:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_mpsadbw:
+; AVX: ## BB#0:
+; AVX-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_mpsadbw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -1132,10 +1577,15 @@ declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind rea
define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse41_packusdw:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_packusdw:
+; AVX: ## BB#0:
+; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_packusdw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -1143,10 +1593,15 @@ declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readno
define <16 x i8> @test_x86_sse41_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse41_pblendvb:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pblendvb:
+; AVX: ## BB#0:
+; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pblendvb:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -1154,10 +1609,15 @@ declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) noun
define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) {
-; CHECK-LABEL: test_x86_sse41_phminposuw:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vphminposuw %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_phminposuw:
+; AVX: ## BB#0:
+; AVX-NEXT: vphminposuw %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_phminposuw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -1165,10 +1625,15 @@ declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse41_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse41_pmaxsb:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pmaxsb:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pmaxsb:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -1176,10 +1641,15 @@ declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse41_pmaxsd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pmaxsd:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pmaxsd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -1187,10 +1657,15 @@ declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse41_pmaxud:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pmaxud:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pmaxud:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -1198,10 +1673,15 @@ declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @test_x86_sse41_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse41_pmaxuw:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pmaxuw:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pmaxuw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -1209,10 +1689,15 @@ declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse41_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse41_pminsb:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pminsb:
+; AVX: ## BB#0:
+; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pminsb:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -1220,10 +1705,15 @@ declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_x86_sse41_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse41_pminsd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pminsd:
+; AVX: ## BB#0:
+; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pminsd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -1231,10 +1721,15 @@ declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse41_pminud(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse41_pminud:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pminud:
+; AVX: ## BB#0:
+; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pminud:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -1242,87 +1737,31 @@ declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @test_x86_sse41_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_sse41_pminuw:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pminuw:
+; AVX: ## BB#0:
+; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pminuw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
-define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) {
-; CHECK-LABEL: test_x86_sse41_pmovzxbd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; CHECK-NEXT: retl
- %res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse41_pmovzxbq(<16 x i8> %a0) {
-; CHECK-LABEL: test_x86_sse41_pmovzxbq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: retl
- %res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse41_pmovzxbw(<16 x i8> %a0) {
-; CHECK-LABEL: test_x86_sse41_pmovzxbw:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NEXT: retl
- %res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse41_pmovzxdq(<4 x i32> %a0) {
-; CHECK-LABEL: test_x86_sse41_pmovzxdq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; CHECK-NEXT: retl
- %res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
-
-
-define <4 x i32> @test_x86_sse41_pmovzxwd(<8 x i16> %a0) {
-; CHECK-LABEL: test_x86_sse41_pmovzxwd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT: retl
- %res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse41_pmovzxwq(<8 x i16> %a0) {
-; CHECK-LABEL: test_x86_sse41_pmovzxwq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; CHECK-NEXT: retl
- %res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
-
-
define <2 x i64> @test_x86_sse41_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_sse41_pmuldq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_pmuldq:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_pmuldq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -1330,12 +1769,19 @@ declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
define i32 @test_x86_sse41_ptestc(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_sse41_ptestc:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vptest %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_ptestc:
+; AVX: ## BB#0:
+; AVX-NEXT: vptest %xmm1, %xmm0
+; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_ptestc:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vptest %xmm1, %xmm0
+; AVX512VL-NEXT: sbbl %eax, %eax
+; AVX512VL-NEXT: andl $1, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1343,12 +1789,19 @@ declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
define i32 @test_x86_sse41_ptestnzc(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_sse41_ptestnzc:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vptest %xmm1, %xmm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_ptestnzc:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vptest %xmm1, %xmm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_ptestnzc:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vptest %xmm1, %xmm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1356,12 +1809,19 @@ declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
define i32 @test_x86_sse41_ptestz(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_sse41_ptestz:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vptest %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_ptestz:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vptest %xmm1, %xmm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_ptestz:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vptest %xmm1, %xmm0
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1369,10 +1829,15 @@ declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x double> @test_x86_sse41_round_pd(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse41_round_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vroundpd $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_round_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_round_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vroundpd $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1380,10 +1845,15 @@ declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readno
define <4 x float> @test_x86_sse41_round_ps(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse41_round_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vroundps $7, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_round_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $7, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_round_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vroundps $7, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1391,10 +1861,15 @@ declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
define <2 x double> @test_x86_sse41_round_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse41_round_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_round_sd:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_round_sd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1402,10 +1877,15 @@ declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) n
define <4 x float> @test_x86_sse41_round_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse41_round_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vroundss $7, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse41_round_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundss $7, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse41_round_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vroundss $7, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1413,13 +1893,21 @@ declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) noun
define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestri128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: movl $7, %edx
-; CHECK-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpestri128:
+; AVX: ## BB#0:
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: movl $7, %edx
+; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX-NEXT: movl %ecx, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpestri128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: movl $7, %edx
+; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: movl %ecx, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1427,16 +1915,27 @@ declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nou
define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestri128_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmovdqa (%eax), %xmm0
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: movl $7, %edx
-; CHECK-NEXT: vpcmpestri $7, (%ecx), %xmm0
-; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpestri128_load:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmovdqa (%eax), %xmm0
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: movl $7, %edx
+; AVX-NEXT: vpcmpestri $7, (%ecx), %xmm0
+; AVX-NEXT: movl %ecx, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpestri128_load:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmovdqa64 (%eax), %xmm0
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: movl $7, %edx
+; AVX512VL-NEXT: vpcmpestri $7, (%ecx), %xmm0
+; AVX512VL-NEXT: movl %ecx, %eax
+; AVX512VL-NEXT: retl
%1 = load <16 x i8>, <16 x i8>* %a0
%2 = load <16 x i8>, <16 x i8>* %a2
%res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7) ; <i32> [#uses=1]
@@ -1444,15 +1943,30 @@ define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
}
-define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestria128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: movl $7, %edx
-; CHECK-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+; AVX-LABEL: test_x86_sse42_pcmpestria128:
+; AVX: ## BB#0:
+; AVX-NEXT: pushl %ebx
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: movl $7, %edx
+; AVX-NEXT: xorl %ebx, %ebx
+; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX-NEXT: seta %bl
+; AVX-NEXT: movl %ebx, %eax
+; AVX-NEXT: popl %ebx
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpestria128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: pushl %ebx
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: movl $7, %edx
+; AVX512VL-NEXT: xorl %ebx, %ebx
+; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: seta %bl
+; AVX512VL-NEXT: movl %ebx, %eax
+; AVX512VL-NEXT: popl %ebx
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestria128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1460,59 +1974,113 @@ declare i32 @llvm.x86.sse42.pcmpestria128(<16 x i8>, i32, <16 x i8>, i32, i8) no
define i32 @test_x86_sse42_pcmpestric128(<16 x i8> %a0, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestric128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: movl $7, %edx
-; CHECK-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpestric128:
+; AVX: ## BB#0:
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: movl $7, %edx
+; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpestric128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: movl $7, %edx
+; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: sbbl %eax, %eax
+; AVX512VL-NEXT: andl $1, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestrio128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: movl $7, %edx
-; CHECK-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; CHECK-NEXT: seto %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+; AVX-LABEL: test_x86_sse42_pcmpestrio128:
+; AVX: ## BB#0:
+; AVX-NEXT: pushl %ebx
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: movl $7, %edx
+; AVX-NEXT: xorl %ebx, %ebx
+; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX-NEXT: seto %bl
+; AVX-NEXT: movl %ebx, %eax
+; AVX-NEXT: popl %ebx
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpestrio128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: pushl %ebx
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: movl $7, %edx
+; AVX512VL-NEXT: xorl %ebx, %ebx
+; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: seto %bl
+; AVX512VL-NEXT: movl %ebx, %eax
+; AVX512VL-NEXT: popl %ebx
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
declare i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestris128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: movl $7, %edx
-; CHECK-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; CHECK-NEXT: sets %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+; AVX-LABEL: test_x86_sse42_pcmpestris128:
+; AVX: ## BB#0:
+; AVX-NEXT: pushl %ebx
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: movl $7, %edx
+; AVX-NEXT: xorl %ebx, %ebx
+; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX-NEXT: sets %bl
+; AVX-NEXT: movl %ebx, %eax
+; AVX-NEXT: popl %ebx
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpestris128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: pushl %ebx
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: movl $7, %edx
+; AVX512VL-NEXT: xorl %ebx, %ebx
+; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: sets %bl
+; AVX512VL-NEXT: movl %ebx, %eax
+; AVX512VL-NEXT: popl %ebx
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestris128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
declare i32 @llvm.x86.sse42.pcmpestris128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestriz128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: movl $7, %edx
-; CHECK-NEXT: vpcmpestri $7, %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+; AVX-LABEL: test_x86_sse42_pcmpestriz128:
+; AVX: ## BB#0:
+; AVX-NEXT: pushl %ebx
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: movl $7, %edx
+; AVX-NEXT: xorl %ebx, %ebx
+; AVX-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX-NEXT: sete %bl
+; AVX-NEXT: movl %ebx, %eax
+; AVX-NEXT: popl %ebx
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpestriz128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: pushl %ebx
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: movl $7, %edx
+; AVX512VL-NEXT: xorl %ebx, %ebx
+; AVX512VL-NEXT: vpcmpestri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: sete %bl
+; AVX512VL-NEXT: movl %ebx, %eax
+; AVX512VL-NEXT: popl %ebx
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1520,12 +2088,19 @@ declare i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8>, i32, <16 x i8>, i32, i8) no
define <16 x i8> @test_x86_sse42_pcmpestrm128(<16 x i8> %a0, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestrm128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: movl $7, %edx
-; CHECK-NEXT: vpcmpestrm $7, %xmm1, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpestrm128:
+; AVX: ## BB#0:
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: movl $7, %edx
+; AVX-NEXT: vpcmpestrm $7, %xmm1, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpestrm128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: movl $7, %edx
+; AVX512VL-NEXT: vpcmpestrm $7, %xmm1, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -1533,13 +2108,21 @@ declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i
define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestrm128_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: movl $7, %edx
-; CHECK-NEXT: vpcmpestrm $7, (%ecx), %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpestrm128_load:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: movl $7, %edx
+; AVX-NEXT: vpcmpestrm $7, (%ecx), %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpestrm128_load:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: movl $7, %edx
+; AVX512VL-NEXT: vpcmpestrm $7, (%ecx), %xmm0
+; AVX512VL-NEXT: retl
%1 = load <16 x i8>, <16 x i8>* %a2
%res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %1, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
@@ -1547,11 +2130,17 @@ define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2
define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistri128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpistri $7, %xmm1, %xmm0
-; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpistri128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX-NEXT: movl %ecx, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpistri128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: movl %ecx, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1559,14 +2148,23 @@ declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind read
define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistri128_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: vmovdqa (%ecx), %xmm0
-; CHECK-NEXT: vpcmpistri $7, (%eax), %xmm0
-; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpistri128_load:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX-NEXT: vmovdqa (%ecx), %xmm0
+; AVX-NEXT: vpcmpistri $7, (%eax), %xmm0
+; AVX-NEXT: movl %ecx, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpistri128_load:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512VL-NEXT: vmovdqa64 (%ecx), %xmm0
+; AVX512VL-NEXT: vpcmpistri $7, (%eax), %xmm0
+; AVX512VL-NEXT: movl %ecx, %eax
+; AVX512VL-NEXT: retl
%1 = load <16 x i8>, <16 x i8>* %a0
%2 = load <16 x i8>, <16 x i8>* %a1
%res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %1, <16 x i8> %2, i8 7) ; <i32> [#uses=1]
@@ -1575,12 +2173,19 @@ define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
define i32 @test_x86_sse42_pcmpistria128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistria128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpistri $7, %xmm1, %xmm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpistria128:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpistria128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistria128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1588,12 +2193,19 @@ declare i32 @llvm.x86.sse42.pcmpistria128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistric128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistric128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpistri $7, %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpistric128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpistric128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: sbbl %eax, %eax
+; AVX512VL-NEXT: andl $1, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1601,12 +2213,19 @@ declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistrio128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpistri $7, %xmm1, %xmm0
-; CHECK-NEXT: seto %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpistrio128:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX-NEXT: seto %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpistrio128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: seto %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1614,12 +2233,19 @@ declare i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistris128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistris128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpistri $7, %xmm1, %xmm0
-; CHECK-NEXT: sets %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpistris128:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX-NEXT: sets %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpistris128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: sets %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistris128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1627,12 +2253,19 @@ declare i32 @llvm.x86.sse42.pcmpistris128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistriz128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpistri $7, %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpistriz128:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpistriz128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vpcmpistri $7, %xmm1, %xmm0
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1640,10 +2273,15 @@ declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind rea
define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistrm128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpistrm $7, %xmm1, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpistrm128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpcmpistrm $7, %xmm1, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpistrm128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpcmpistrm $7, %xmm1, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -1651,11 +2289,17 @@ declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwin
define <16 x i8> @test_x86_sse42_pcmpistrm128_load(<16 x i8> %a0, <16 x i8>* %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistrm128_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vpcmpistrm $7, (%eax), %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse42_pcmpistrm128_load:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vpcmpistrm $7, (%eax), %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse42_pcmpistrm128_load:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpcmpistrm $7, (%eax), %xmm0
+; AVX512VL-NEXT: retl
%1 = load <16 x i8>, <16 x i8>* %a1
%res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %1, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
@@ -1663,10 +2307,15 @@ define <16 x i8> @test_x86_sse42_pcmpistrm128_load(<16 x i8> %a0, <16 x i8>* %a1
define <4 x float> @test_x86_sse_add_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_add_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_add_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_add_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1674,10 +2323,15 @@ declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_cmp_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_cmp_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcmpordps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_cmp_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vcmpordps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_cmp_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcmpordps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1685,10 +2339,15 @@ declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind
define <4 x float> @test_x86_sse_cmp_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_cmp_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcmpordss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_cmp_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vcmpordss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_cmp_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcmpordss %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1696,12 +2355,23 @@ declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind
define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_comieq_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomiss %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_comieq_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vcomiss %xmm1, %xmm0
+; AVX-NEXT: setnp %al
+; AVX-NEXT: sete %cl
+; AVX-NEXT: andb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_comieq_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcomiss %xmm1, %xmm0
+; AVX512VL-NEXT: setnp %al
+; AVX512VL-NEXT: sete %cl
+; AVX512VL-NEXT: andb %al, %cl
+; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1709,12 +2379,19 @@ declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comige_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_comige_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomiss %xmm1, %xmm0
-; CHECK-NEXT: setae %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_comige_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomiss %xmm1, %xmm0
+; AVX-NEXT: setae %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_comige_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vcomiss %xmm1, %xmm0
+; AVX512VL-NEXT: setae %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1722,12 +2399,19 @@ declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comigt_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_comigt_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomiss %xmm1, %xmm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_comigt_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomiss %xmm1, %xmm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_comigt_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vcomiss %xmm1, %xmm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1735,12 +2419,19 @@ declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comile_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_comile_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomiss %xmm1, %xmm0
-; CHECK-NEXT: setbe %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_comile_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: setae %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_comile_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vcomiss %xmm0, %xmm1
+; AVX512VL-NEXT: setae %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1748,12 +2439,19 @@ declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comilt_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_comilt_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomiss %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_comilt_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_comilt_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vcomiss %xmm0, %xmm1
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1761,12 +2459,23 @@ declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_comineq_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcomiss %xmm1, %xmm0
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_comineq_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vcomiss %xmm1, %xmm0
+; AVX-NEXT: setp %al
+; AVX-NEXT: setne %cl
+; AVX-NEXT: orb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_comineq_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcomiss %xmm1, %xmm0
+; AVX512VL-NEXT: setp %al
+; AVX512VL-NEXT: setne %cl
+; AVX512VL-NEXT: orb %al, %cl
+; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1774,11 +2483,17 @@ declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_cvtsi2ss(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_cvtsi2ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl $7, %eax
-; CHECK-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_cvtsi2ss:
+; AVX: ## BB#0:
+; AVX-NEXT: movl $7, %eax
+; AVX-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_cvtsi2ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl $7, %eax
+; AVX512VL-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1786,10 +2501,15 @@ declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
define i32 @test_x86_sse_cvtss2si(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_cvtss2si:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2si %xmm0, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_cvtss2si:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtss2si %xmm0, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_cvtss2si:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtss2si %xmm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1797,10 +2517,15 @@ declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
define i32 @test_x86_sse_cvttss2si(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_cvttss2si:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvttss2si %xmm0, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_cvttss2si:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvttss2si %xmm0, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_cvttss2si:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvttss2si %xmm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1808,10 +2533,15 @@ declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_div_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_div_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vdivss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_div_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_div_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1819,11 +2549,17 @@ declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind read
define void @test_x86_sse_ldmxcsr(i8* %a0) {
-; CHECK-LABEL: test_x86_sse_ldmxcsr:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vldmxcsr (%eax)
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_ldmxcsr:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vldmxcsr (%eax)
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_ldmxcsr:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vldmxcsr (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.sse.ldmxcsr(i8* %a0)
ret void
}
@@ -1832,10 +2568,15 @@ declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind
define <4 x float> @test_x86_sse_max_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_max_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_max_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_max_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1843,10 +2584,15 @@ declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_max_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_max_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmaxss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_max_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_max_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1854,10 +2600,15 @@ declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_min_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_min_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_min_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_min_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vminps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1865,10 +2616,15 @@ declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_min_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_min_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vminss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_min_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_min_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vminss %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1876,10 +2632,15 @@ declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind read
define i32 @test_x86_sse_movmsk_ps(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_movmsk_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovmskps %xmm0, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_movmsk_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovmskps %xmm0, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_movmsk_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmovmskps %xmm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1888,10 +2649,15 @@ declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_mul_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_mul_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_mul_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_mul_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1899,10 +2665,15 @@ declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_rcp_ps(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_rcp_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vrcpps %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_rcp_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vrcpps %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_rcp_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vrcp14ps %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1910,10 +2681,15 @@ declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_rcp_ss(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_rcp_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vrcpss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_rcp_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_rcp_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vrcpss %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1921,10 +2697,15 @@ declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_rsqrt_ps(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_rsqrt_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vrsqrtps %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_rsqrt_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vrsqrtps %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_rsqrt_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vrsqrt14ps %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1932,10 +2713,15 @@ declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_rsqrt_ss(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_rsqrt_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_rsqrt_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_rsqrt_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1943,10 +2729,15 @@ declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_sqrt_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vsqrtps %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_sqrt_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vsqrtps %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_sqrt_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vsqrtps %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1954,10 +2745,15 @@ declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_sqrt_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_sqrt_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_sqrt_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -1965,34 +2761,33 @@ declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define void @test_x86_sse_stmxcsr(i8* %a0) {
-; CHECK-LABEL: test_x86_sse_stmxcsr:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vstmxcsr (%eax)
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_stmxcsr:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vstmxcsr (%eax)
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_stmxcsr:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vstmxcsr (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.sse.stmxcsr(i8* %a0)
ret void
}
declare void @llvm.x86.sse.stmxcsr(i8*) nounwind
-define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_storeu_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmovups %xmm0, (%eax)
-; CHECK-NEXT: retl
- call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
- ret void
-}
-declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
-
-
define <4 x float> @test_x86_sse_sub_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_sub_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_sub_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_sub_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -2000,12 +2795,23 @@ declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind read
define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_ucomieq_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomiss %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_ucomieq_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vucomiss %xmm1, %xmm0
+; AVX-NEXT: setnp %al
+; AVX-NEXT: sete %cl
+; AVX-NEXT: andb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_ucomieq_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vucomiss %xmm1, %xmm0
+; AVX512VL-NEXT: setnp %al
+; AVX512VL-NEXT: sete %cl
+; AVX512VL-NEXT: andb %al, %cl
+; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2013,12 +2819,19 @@ declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomige_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_ucomige_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomiss %xmm1, %xmm0
-; CHECK-NEXT: setae %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_ucomige_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vucomiss %xmm1, %xmm0
+; AVX-NEXT: setae %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_ucomige_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vucomiss %xmm1, %xmm0
+; AVX512VL-NEXT: setae %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2026,12 +2839,19 @@ declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_ucomigt_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomiss %xmm1, %xmm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_ucomigt_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vucomiss %xmm1, %xmm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_ucomigt_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vucomiss %xmm1, %xmm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2039,12 +2859,19 @@ declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomile_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_ucomile_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomiss %xmm1, %xmm0
-; CHECK-NEXT: setbe %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_ucomile_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: setae %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_ucomile_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vucomiss %xmm0, %xmm1
+; AVX512VL-NEXT: setae %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2052,12 +2879,19 @@ declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_ucomilt_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomiss %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_ucomilt_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_ucomilt_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vucomiss %xmm0, %xmm1
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2065,12 +2899,23 @@ declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_ucomineq_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vucomiss %xmm1, %xmm0
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_sse_ucomineq_ss:
+; AVX: ## BB#0:
+; AVX-NEXT: vucomiss %xmm1, %xmm0
+; AVX-NEXT: setp %al
+; AVX-NEXT: setne %cl
+; AVX-NEXT: orb %al, %cl
+; AVX-NEXT: movzbl %cl, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_sse_ucomineq_ss:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vucomiss %xmm1, %xmm0
+; AVX512VL-NEXT: setp %al
+; AVX512VL-NEXT: setne %cl
+; AVX512VL-NEXT: orb %al, %cl
+; AVX512VL-NEXT: movzbl %cl, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2078,10 +2923,15 @@ declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnon
define <16 x i8> @test_x86_ssse3_pabs_b_128(<16 x i8> %a0) {
-; CHECK-LABEL: test_x86_ssse3_pabs_b_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpabsb %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_pabs_b_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpabsb %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_pabs_b_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpabsb %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -2089,10 +2939,15 @@ declare <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8>) nounwind readnone
define <4 x i32> @test_x86_ssse3_pabs_d_128(<4 x i32> %a0) {
-; CHECK-LABEL: test_x86_ssse3_pabs_d_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpabsd %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_pabs_d_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpabsd %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_pabs_d_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpabsd %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -2100,10 +2955,15 @@ declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone
define <8 x i16> @test_x86_ssse3_pabs_w_128(<8 x i16> %a0) {
-; CHECK-LABEL: test_x86_ssse3_pabs_w_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpabsw %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_pabs_w_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpabsw %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_pabs_w_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpabsw %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -2111,10 +2971,15 @@ declare <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16>) nounwind readnone
define <4 x i32> @test_x86_ssse3_phadd_d_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_ssse3_phadd_d_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vphaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_phadd_d_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_phadd_d_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -2122,10 +2987,15 @@ declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind rea
define <8 x i16> @test_x86_ssse3_phadd_sw_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_ssse3_phadd_sw_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vphaddsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_phadd_sw_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vphaddsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_phadd_sw_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphaddsw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -2133,10 +3003,15 @@ declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind re
define <8 x i16> @test_x86_ssse3_phadd_w_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_ssse3_phadd_w_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vphaddw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_phadd_w_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_phadd_w_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphaddw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -2144,10 +3019,15 @@ declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind rea
define <4 x i32> @test_x86_ssse3_phsub_d_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_ssse3_phsub_d_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vphsubd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_phsub_d_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_phsub_d_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphsubd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -2155,10 +3035,15 @@ declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind rea
define <8 x i16> @test_x86_ssse3_phsub_sw_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_ssse3_phsub_sw_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vphsubsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_phsub_sw_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vphsubsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_phsub_sw_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphsubsw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -2166,10 +3051,15 @@ declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind re
define <8 x i16> @test_x86_ssse3_phsub_w_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_ssse3_phsub_w_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vphsubw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_phsub_w_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vphsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_phsub_w_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphsubw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -2177,10 +3067,15 @@ declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind rea
define <8 x i16> @test_x86_ssse3_pmadd_ub_sw_128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_ssse3_pmadd_ub_sw_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_pmadd_ub_sw_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_pmadd_ub_sw_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -2188,10 +3083,15 @@ declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind
define <8 x i16> @test_x86_ssse3_pmul_hr_sw_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_ssse3_pmul_hr_sw_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_pmul_hr_sw_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_pmul_hr_sw_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -2199,10 +3099,15 @@ declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind
define <16 x i8> @test_x86_ssse3_pshuf_b_128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_ssse3_pshuf_b_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_pshuf_b_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_pshuf_b_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -2210,10 +3115,15 @@ declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind rea
define <16 x i8> @test_x86_ssse3_psign_b_128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_ssse3_psign_b_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsignb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_psign_b_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsignb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_psign_b_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsignb %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -2221,10 +3131,15 @@ declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind rea
define <4 x i32> @test_x86_ssse3_psign_d_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_ssse3_psign_d_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsignd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_psign_d_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsignd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_psign_d_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsignd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -2232,10 +3147,15 @@ declare <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32>, <4 x i32>) nounwind rea
define <8 x i16> @test_x86_ssse3_psign_w_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_ssse3_psign_w_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsignw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_ssse3_psign_w_128:
+; AVX: ## BB#0:
+; AVX-NEXT: vpsignw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_ssse3_psign_w_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsignw %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -2243,10 +3163,15 @@ declare <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16>, <8 x i16>) nounwind rea
define <4 x double> @test_x86_avx_addsub_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_addsub_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_addsub_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_addsub_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2254,10 +3179,15 @@ declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nou
define <8 x float> @test_x86_avx_addsub_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_addsub_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaddsubps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_addsub_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vaddsubps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_addsub_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vaddsubps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2265,10 +3195,15 @@ declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwi
define <4 x double> @test_x86_avx_blendv_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
-; CHECK-LABEL: test_x86_avx_blendv_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_blendv_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_blendv_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2276,10 +3211,15 @@ declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4
define <8 x float> @test_x86_avx_blendv_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
-; CHECK-LABEL: test_x86_avx_blendv_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_blendv_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_blendv_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2287,10 +3227,15 @@ declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x f
define <4 x double> @test_x86_avx_cmp_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_cmp_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcmpordpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_cmp_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vcmpordpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cmp_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcmpordpd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2298,50 +3243,91 @@ declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) no
define <8 x float> @test_x86_avx_cmp_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_cmp_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcmpordps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_cmp_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vcmpordps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cmp_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcmpordps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
define <8 x float> @test_x86_avx_cmp_ps_256_pseudo_op(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_cmp_ps_256_pseudo_op:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpltps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpleps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpunordps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpneqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpnltps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpnleps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpordps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpeq_uqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpngeps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpngtps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpfalseps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpneq_oqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpgeps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpgtps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmptrueps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpeq_osps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmplt_oqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmple_oqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpunord_sps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpneq_usps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpnlt_uqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpnle_uqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpord_sps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpeq_usps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpnge_uqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpngt_uqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpfalse_osps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpneq_osps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpge_oqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmpgt_oqps %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vcmptrue_usps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_cmp_ps_256_pseudo_op:
+; AVX: ## BB#0:
+; AVX-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpltps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpleps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpunordps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpneqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpnltps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpnleps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpordps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpeq_uqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpngeps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpngtps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpfalseps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpneq_oqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpgeps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpgtps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmptrueps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpeq_osps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmplt_oqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmple_oqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpunord_sps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpneq_usps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpnlt_uqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpnle_uqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpord_sps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpeq_usps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpnge_uqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpngt_uqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpfalse_osps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpneq_osps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpge_oqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmpgt_oqps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vcmptrue_usps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cmp_ps_256_pseudo_op:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpltps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpleps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpunordps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpneqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpnltps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpnleps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpordps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpeq_uqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpngeps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpngtps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpfalseps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpneq_oqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpgeps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpgtps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmptrueps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpeq_osps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmplt_oqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmple_oqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpunord_sps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpneq_usps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpnlt_uqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpnle_uqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpord_sps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpeq_usps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpnge_uqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpngt_uqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpfalse_osps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpneq_osps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpge_oqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmpgt_oqps %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vcmptrue_usps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%a2 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 0) ; <<8 x float>> [#uses=1]
%a3 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a2, i8 1) ; <<8 x float>> [#uses=1]
%a4 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a3, i8 2) ; <<8 x float>> [#uses=1]
@@ -2380,11 +3366,16 @@ declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounw
define <4 x float> @test_x86_avx_cvt_pd2_ps_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_cvt_pd2_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtpd2psy %ymm0, %xmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_cvt_pd2_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtpd2psy %ymm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cvt_pd2_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtpd2psy %ymm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -2392,89 +3383,64 @@ declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>) nounwind readnone
define <4 x i32> @test_x86_avx_cvt_pd2dq_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_cvt_pd2dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtpd2dqy %ymm0, %xmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_cvt_pd2dq_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtpd2dqy %ymm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cvt_pd2dq_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtpd2dqy %ymm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone
-define <4 x double> @test_x86_avx_cvt_ps2_pd_256(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_cvt_ps2_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0
-; CHECK-NEXT: retl
- %res = call <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float> %a0) ; <<4 x double>> [#uses=1]
- ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone
-
-
define <8 x i32> @test_x86_avx_cvt_ps2dq_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_cvt_ps2dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtps2dq %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_cvt_ps2dq_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtps2dq %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cvt_ps2dq_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtps2dq %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone
-define <4 x double> @test_x86_avx_cvtdq2_pd_256(<4 x i32> %a0) {
-; CHECK-LABEL: test_x86_avx_cvtdq2_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0
-; CHECK-NEXT: retl
- %res = call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %a0) ; <<4 x double>> [#uses=1]
- ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone
-
-
define <8 x float> @test_x86_avx_cvtdq2_ps_256(<8 x i32> %a0) {
-; CHECK-LABEL: test_x86_avx_cvtdq2_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_cvtdq2_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cvtdq2_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) nounwind readnone
-define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_cvtt_pd2dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvttpd2dqy %ymm0, %xmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
- %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_cvtt_ps2dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
-; CHECK-NEXT: retl
- %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
-
-
define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_dp_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_dp_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_dp_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2482,10 +3448,15 @@ declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwi
define <4 x double> @test_x86_avx_hadd_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_hadd_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_hadd_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_hadd_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2493,10 +3464,15 @@ declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounw
define <8 x float> @test_x86_avx_hadd_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_hadd_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhaddps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_hadd_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_hadd_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vhaddps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2504,10 +3480,15 @@ declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind
define <4 x double> @test_x86_avx_hsub_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_hsub_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_hsub_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_hsub_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2515,10 +3496,15 @@ declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounw
define <8 x float> @test_x86_avx_hsub_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_hsub_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhsubps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_hsub_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_hsub_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vhsubps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2526,11 +3512,17 @@ declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind
define <32 x i8> @test_x86_avx_ldu_dq_256(i8* %a0) {
-; CHECK-LABEL: test_x86_avx_ldu_dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vlddqu (%eax), %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_ldu_dq_256:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vlddqu (%eax), %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_ldu_dq_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vlddqu (%eax), %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %a0) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -2538,11 +3530,17 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readonly
define <2 x double> @test_x86_avx_maskload_pd(i8* %a0, <2 x i64> %mask) {
-; CHECK-LABEL: test_x86_avx_maskload_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmaskmovpd (%eax), %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_maskload_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmaskmovpd (%eax), %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_maskload_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmaskmovpd (%eax), %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %a0, <2 x i64> %mask) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -2550,11 +3548,17 @@ declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>) nounwind readonly
define <4 x double> @test_x86_avx_maskload_pd_256(i8* %a0, <4 x i64> %mask) {
-; CHECK-LABEL: test_x86_avx_maskload_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmaskmovpd (%eax), %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_maskload_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmaskmovpd (%eax), %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_maskload_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmaskmovpd (%eax), %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %a0, <4 x i64> %mask) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2562,11 +3566,17 @@ declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>) nounwind read
define <4 x float> @test_x86_avx_maskload_ps(i8* %a0, <4 x i32> %mask) {
-; CHECK-LABEL: test_x86_avx_maskload_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmaskmovps (%eax), %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_maskload_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmaskmovps (%eax), %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_maskload_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmaskmovps (%eax), %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %a0, <4 x i32> %mask) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -2574,11 +3584,17 @@ declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>) nounwind readonly
define <8 x float> @test_x86_avx_maskload_ps_256(i8* %a0, <8 x i32> %mask) {
-; CHECK-LABEL: test_x86_avx_maskload_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmaskmovps (%eax), %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_maskload_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmaskmovps (%eax), %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_maskload_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmaskmovps (%eax), %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %a0, <8 x i32> %mask) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2586,11 +3602,17 @@ declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>) nounwind reado
define void @test_x86_avx_maskstore_pd(i8* %a0, <2 x i64> %mask, <2 x double> %a2) {
-; CHECK-LABEL: test_x86_avx_maskstore_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmaskmovpd %xmm1, %xmm0, (%eax)
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_maskstore_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmaskmovpd %xmm1, %xmm0, (%eax)
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_maskstore_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmaskmovpd %xmm1, %xmm0, (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx.maskstore.pd(i8* %a0, <2 x i64> %mask, <2 x double> %a2)
ret void
}
@@ -2598,12 +3620,18 @@ declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind
define void @test_x86_avx_maskstore_pd_256(i8* %a0, <4 x i64> %mask, <4 x double> %a2) {
-; CHECK-LABEL: test_x86_avx_maskstore_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_maskstore_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_maskstore_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %mask, <4 x double> %a2)
ret void
}
@@ -2611,11 +3639,17 @@ declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwi
define void @test_x86_avx_maskstore_ps(i8* %a0, <4 x i32> %mask, <4 x float> %a2) {
-; CHECK-LABEL: test_x86_avx_maskstore_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmaskmovps %xmm1, %xmm0, (%eax)
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_maskstore_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmaskmovps %xmm1, %xmm0, (%eax)
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_maskstore_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmaskmovps %xmm1, %xmm0, (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx.maskstore.ps(i8* %a0, <4 x i32> %mask, <4 x float> %a2)
ret void
}
@@ -2623,12 +3657,18 @@ declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind
define void @test_x86_avx_maskstore_ps_256(i8* %a0, <8 x i32> %mask, <8 x float> %a2) {
-; CHECK-LABEL: test_x86_avx_maskstore_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmaskmovps %ymm1, %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_maskstore_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmaskmovps %ymm1, %ymm0, (%eax)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_maskstore_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmaskmovps %ymm1, %ymm0, (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %mask, <8 x float> %a2)
ret void
}
@@ -2636,10 +3676,15 @@ declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwin
define <4 x double> @test_x86_avx_max_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_max_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_max_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_max_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2647,10 +3692,15 @@ declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwi
define <8 x float> @test_x86_avx_max_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_max_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_max_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_max_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmaxps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2658,10 +3708,15 @@ declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind
define <4 x double> @test_x86_avx_min_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_min_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vminpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_min_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_min_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vminpd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2669,10 +3724,15 @@ declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwi
define <8 x float> @test_x86_avx_min_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_min_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_min_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_min_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vminps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2680,11 +3740,16 @@ declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind
define i32 @test_x86_avx_movmsk_pd_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_movmsk_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovmskpd %ymm0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_movmsk_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovmskpd %ymm0, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_movmsk_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmovmskpd %ymm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2692,11 +3757,16 @@ declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
define i32 @test_x86_avx_movmsk_ps_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_movmsk_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovmskps %ymm0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_movmsk_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovmskps %ymm0, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_movmsk_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmovmskps %ymm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2709,13 +3779,20 @@ declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
define i32 @test_x86_avx_ptestc_256(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: test_x86_avx_ptestc_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vptest %ymm1, %ymm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_ptestc_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vptest %ymm1, %ymm0
+; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_ptestc_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vptest %ymm1, %ymm0
+; AVX512VL-NEXT: sbbl %eax, %eax
+; AVX512VL-NEXT: andl $1, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2723,13 +3800,20 @@ declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone
define i32 @test_x86_avx_ptestnzc_256(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: test_x86_avx_ptestnzc_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vptest %ymm1, %ymm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_ptestnzc_256:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vptest %ymm1, %ymm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_ptestnzc_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vptest %ymm1, %ymm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2737,13 +3821,20 @@ declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone
define i32 @test_x86_avx_ptestz_256(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: test_x86_avx_ptestz_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vptest %ymm1, %ymm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_ptestz_256:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vptest %ymm1, %ymm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_ptestz_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vptest %ymm1, %ymm0
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -2751,10 +3842,15 @@ declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone
define <8 x float> @test_x86_avx_rcp_ps_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_rcp_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vrcpps %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_rcp_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vrcpps %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_rcp_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vrcp14ps %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2762,10 +3858,15 @@ declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
define <4 x double> @test_x86_avx_round_pd_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_round_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vroundpd $7, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_round_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $7, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_round_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vroundpd $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2773,10 +3874,15 @@ declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind read
define <8 x float> @test_x86_avx_round_ps_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_round_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vroundps $7, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_round_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $7, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_round_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vroundps $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2784,10 +3890,15 @@ declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readno
define <8 x float> @test_x86_avx_rsqrt_ps_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_rsqrt_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vrsqrtps %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_rsqrt_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vrsqrtps %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_rsqrt_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vrsqrt14ps %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2795,10 +3906,15 @@ declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
define <4 x double> @test_x86_avx_sqrt_pd_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_sqrt_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vsqrtpd %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_sqrt_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vsqrtpd %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_sqrt_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vsqrtpd %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2806,73 +3922,33 @@ declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_sqrt_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vsqrtps %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_sqrt_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vsqrtps %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_sqrt_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vsqrtps %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
-define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
- ; FIXME: unfortunately the execution domain fix pass changes this to vmovups and its hard to force with no 256-bit integer instructions
- ; add operation forces the execution domain.
-; CHECK-LABEL: test_x86_avx_storeu_dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vmovups %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
- %a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
- call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
- ret void
-}
-declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
-
-
-define void @test_x86_avx_storeu_pd_256(i8* %a0, <4 x double> %a1) {
- ; add operation forces the execution domain.
-; CHECK-LABEL: test_x86_avx_storeu_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vmovupd %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
- %a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
- call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a2)
- ret void
-}
-declare void @llvm.x86.avx.storeu.pd.256(i8*, <4 x double>) nounwind
-
-
-define void @test_x86_avx_storeu_ps_256(i8* %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_storeu_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmovups %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
- call void @llvm.x86.avx.storeu.ps.256(i8* %a0, <8 x float> %a1)
- ret void
-}
-declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) nounwind
-
-
define <4 x double> @test_x86_avx_vbroadcastf128_pd_256(i8* %a0) {
-; CHECK-LABEL: test_x86_avx_vbroadcastf128_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vbroadcastf128 (%eax), %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vbroadcastf128_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vbroadcastf128_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8* %a0) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2880,11 +3956,17 @@ declare <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8*) nounwind readonly
define <8 x float> @test_x86_avx_vbroadcastf128_ps_256(i8* %a0) {
-; CHECK-LABEL: test_x86_avx_vbroadcastf128_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vbroadcastf128 (%eax), %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vbroadcastf128_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vbroadcastf128_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8* %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2892,10 +3974,15 @@ declare <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8*) nounwind readonly
define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vperm2f128_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vperm2f128_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vperm2f128_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -2903,10 +3990,15 @@ declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>,
define <8 x float> @test_x86_avx_vperm2f128_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vperm2f128_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vperm2f128_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vperm2f128_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -2914,65 +4006,31 @@ declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8
define <8 x i32> @test_x86_avx_vperm2f128_si_256(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: test_x86_avx_vperm2f128_si_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vperm2f128_si_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vperm2f128_si_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %a0, <8 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
-define <2 x double> @test_x86_avx_vpermil_pd(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_vpermil_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT: retl
- %res = call <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double> %a0, i8 1) ; <<2 x double>> [#uses=1]
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8) nounwind readnone
-
-
-define <4 x double> @test_x86_avx_vpermil_pd_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_vpermil_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
-; CHECK-NEXT: retl
- %res = call <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double> %a0, i8 7) ; <<4 x double>> [#uses=1]
- ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8) nounwind readnone
-
-
-define <4 x float> @test_x86_avx_vpermil_ps(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_vpermil_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,0]
-; CHECK-NEXT: retl
- %res = call <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float> %a0, i8 7) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float>, i8) nounwind readnone
-
-
-define <8 x float> @test_x86_avx_vpermil_ps_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_vpermil_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,0,0,7,5,4,4]
-; CHECK-NEXT: retl
- %res = call <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float> %a0, i8 7) ; <<8 x float>> [#uses=1]
- ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float>, i8) nounwind readnone
-
-
define <2 x double> @test_x86_avx_vpermilvar_pd(<2 x double> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_avx_vpermilvar_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vpermilvar_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -2980,38 +4038,59 @@ declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwi
define <4 x double> @test_x86_avx_vpermilvar_pd_256(<4 x double> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: test_x86_avx_vpermilvar_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vpermilvar_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone
define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_vpermilvar_pd_256_2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpermilpd {{.*}}, %ymm0, %ymm0 ## ymm0 = ymm0[1,0,2,3]
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vpermilvar_pd_256_2:
+; AVX: ## BB#0:
+; AVX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd_256_2:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 0, i64 2>) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
define <4 x float> @test_x86_avx_vpermilvar_ps(<4 x float> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_avx_vpermilvar_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vpermilvar_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vpermilps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vpermilvar_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
define <4 x float> @test_x86_avx_vpermilvar_ps_load(<4 x float> %a0, <4 x i32>* %a1) {
-; CHECK-LABEL: test_x86_avx_vpermilvar_ps_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vpermilps (%eax), %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vpermilvar_ps_load:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vpermilps (%eax), %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vpermilvar_ps_load:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpermilps (%eax), %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%a2 = load <4 x i32>, <4 x i32>* %a1
%res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a2) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
@@ -3020,10 +4099,15 @@ declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind
define <8 x float> @test_x86_avx_vpermilvar_ps_256(<8 x float> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: test_x86_avx_vpermilvar_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vpermilvar_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vpermilps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vpermilvar_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpermilps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -3031,12 +4115,19 @@ declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) noun
define i32 @test_x86_avx_vtestc_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestc_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestpd %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestc_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: vtestpd %xmm1, %xmm0
+; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestc_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vtestpd %xmm1, %xmm0
+; AVX512VL-NEXT: sbbl %eax, %eax
+; AVX512VL-NEXT: andl $1, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3044,13 +4135,20 @@ declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnon
define i32 @test_x86_avx_vtestc_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestc_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestpd %ymm1, %ymm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestc_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vtestpd %ymm1, %ymm0
+; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestc_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vtestpd %ymm1, %ymm0
+; AVX512VL-NEXT: sbbl %eax, %eax
+; AVX512VL-NEXT: andl $1, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3058,12 +4156,19 @@ declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind rea
define i32 @test_x86_avx_vtestc_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestc_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestps %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestc_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: vtestps %xmm1, %xmm0
+; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestc_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vtestps %xmm1, %xmm0
+; AVX512VL-NEXT: sbbl %eax, %eax
+; AVX512VL-NEXT: andl $1, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3071,13 +4176,20 @@ declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_avx_vtestc_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestc_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestps %ymm1, %ymm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestc_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vtestps %ymm1, %ymm0
+; AVX-NEXT: sbbl %eax, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestc_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vtestps %ymm1, %ymm0
+; AVX512VL-NEXT: sbbl %eax, %eax
+; AVX512VL-NEXT: andl $1, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3085,12 +4197,19 @@ declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readn
define i32 @test_x86_avx_vtestnzc_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestnzc_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestpd %xmm1, %xmm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestnzc_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vtestpd %xmm1, %xmm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestnzc_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vtestpd %xmm1, %xmm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3098,13 +4217,20 @@ declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_avx_vtestnzc_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestnzc_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestpd %ymm1, %ymm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestnzc_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vtestpd %ymm1, %ymm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestnzc_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vtestpd %ymm1, %ymm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3112,12 +4238,19 @@ declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind r
define i32 @test_x86_avx_vtestnzc_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestnzc_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestps %xmm1, %xmm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestnzc_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vtestps %xmm1, %xmm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestnzc_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vtestps %xmm1, %xmm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3125,13 +4258,20 @@ declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnon
define i32 @test_x86_avx_vtestnzc_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestnzc_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestps %ymm1, %ymm0
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestnzc_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vtestps %ymm1, %ymm0
+; AVX-NEXT: seta %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestnzc_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vtestps %ymm1, %ymm0
+; AVX512VL-NEXT: seta %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3139,12 +4279,19 @@ declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind rea
define i32 @test_x86_avx_vtestz_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestz_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestpd %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestz_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vtestpd %xmm1, %xmm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestz_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vtestpd %xmm1, %xmm0
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3152,13 +4299,20 @@ declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnon
define i32 @test_x86_avx_vtestz_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestz_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestpd %ymm1, %ymm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestz_pd_256:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vtestpd %ymm1, %ymm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestz_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vtestpd %ymm1, %ymm0
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3166,12 +4320,19 @@ declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind rea
define i32 @test_x86_avx_vtestz_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestz_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestps %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestz_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vtestps %xmm1, %xmm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestz_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vtestps %xmm1, %xmm0
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3179,13 +4340,20 @@ declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_avx_vtestz_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vtestz_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vtestps %ymm1, %ymm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vtestz_ps_256:
+; AVX: ## BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vtestps %ymm1, %ymm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vtestz_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: vtestps %ymm1, %ymm0
+; AVX512VL-NEXT: sete %al
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -3193,11 +4361,15 @@ declare i32 @llvm.x86.avx.vtestz.ps.256(<8 x float>, <8 x float>) nounwind readn
define void @test_x86_avx_vzeroall() {
-; CHECK-LABEL: test_x86_avx_vzeroall:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vzeroall
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vzeroall:
+; AVX: ## BB#0:
+; AVX-NEXT: vzeroall
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vzeroall:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vzeroall
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx.vzeroall()
ret void
}
@@ -3205,11 +4377,15 @@ declare void @llvm.x86.avx.vzeroall() nounwind
define void @test_x86_avx_vzeroupper() {
-; CHECK-LABEL: test_x86_avx_vzeroupper:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_avx_vzeroupper:
+; AVX: ## BB#0:
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_vzeroupper:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx.vzeroupper()
ret void
}
@@ -3218,113 +4394,175 @@ declare void @llvm.x86.avx.vzeroupper() nounwind
; Make sure instructions that have no AVX equivalents but are associated with SSEX feature flags still work
define void @monitor(i8* %P, i32 %E, i32 %H) nounwind {
-; CHECK-LABEL: monitor:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: leal (%eax), %eax
-; CHECK-NEXT: monitor
-; CHECK-NEXT: retl
+; AVX-LABEL: monitor:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: leal (%eax), %eax
+; AVX-NEXT: monitor
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: monitor:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: leal (%eax), %eax
+; AVX512VL-NEXT: monitor
+; AVX512VL-NEXT: retl
tail call void @llvm.x86.sse3.monitor(i8* %P, i32 %E, i32 %H)
ret void
}
declare void @llvm.x86.sse3.monitor(i8*, i32, i32) nounwind
define void @mwait(i32 %E, i32 %H) nounwind {
-; CHECK-LABEL: mwait:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: mwait
-; CHECK-NEXT: retl
+; AVX-LABEL: mwait:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: mwait
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: mwait:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: mwait
+; AVX512VL-NEXT: retl
tail call void @llvm.x86.sse3.mwait(i32 %E, i32 %H)
ret void
}
declare void @llvm.x86.sse3.mwait(i32, i32) nounwind
define void @sfence() nounwind {
-; CHECK-LABEL: sfence:
-; CHECK: ## BB#0:
-; CHECK-NEXT: sfence
-; CHECK-NEXT: retl
+; AVX-LABEL: sfence:
+; AVX: ## BB#0:
+; AVX-NEXT: sfence
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: sfence:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: sfence
+; AVX512VL-NEXT: retl
tail call void @llvm.x86.sse.sfence()
ret void
}
declare void @llvm.x86.sse.sfence() nounwind
define void @lfence() nounwind {
-; CHECK-LABEL: lfence:
-; CHECK: ## BB#0:
-; CHECK-NEXT: lfence
-; CHECK-NEXT: retl
+; AVX-LABEL: lfence:
+; AVX: ## BB#0:
+; AVX-NEXT: lfence
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: lfence:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: lfence
+; AVX512VL-NEXT: retl
tail call void @llvm.x86.sse2.lfence()
ret void
}
declare void @llvm.x86.sse2.lfence() nounwind
define void @mfence() nounwind {
-; CHECK-LABEL: mfence:
-; CHECK: ## BB#0:
-; CHECK-NEXT: mfence
-; CHECK-NEXT: retl
+; AVX-LABEL: mfence:
+; AVX: ## BB#0:
+; AVX-NEXT: mfence
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: mfence:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: mfence
+; AVX512VL-NEXT: retl
tail call void @llvm.x86.sse2.mfence()
ret void
}
declare void @llvm.x86.sse2.mfence() nounwind
define void @clflush(i8* %p) nounwind {
-; CHECK-LABEL: clflush:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: clflush (%eax)
-; CHECK-NEXT: retl
+; AVX-LABEL: clflush:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: clflush (%eax)
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: clflush:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: clflush (%eax)
+; AVX512VL-NEXT: retl
tail call void @llvm.x86.sse2.clflush(i8* %p)
ret void
}
declare void @llvm.x86.sse2.clflush(i8*) nounwind
define i32 @crc32_32_8(i32 %a, i8 %b) nounwind {
-; CHECK-LABEL: crc32_32_8:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: crc32b {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: crc32_32_8:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: crc32b {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: crc32_32_8:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: crc32b {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: retl
%tmp = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a, i8 %b)
ret i32 %tmp
}
declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
define i32 @crc32_32_16(i32 %a, i16 %b) nounwind {
-; CHECK-LABEL: crc32_32_16:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: crc32w {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: crc32_32_16:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: crc32w {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: crc32_32_16:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: crc32w {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: retl
%tmp = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a, i16 %b)
ret i32 %tmp
}
declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
define i32 @crc32_32_32(i32 %a, i32 %b) nounwind {
-; CHECK-LABEL: crc32_32_32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: crc32l {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: retl
+; AVX-LABEL: crc32_32_32:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: crc32l {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: crc32_32_32:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: crc32l {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: retl
%tmp = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a, i32 %b)
ret i32 %tmp
}
declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
-; CHECK-LABEL: movnt_dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vpaddq LCPI277_0, %xmm0, %xmm0
-; CHECK-NEXT: vmovntdq %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: movnt_dq:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0
+; AVX-NEXT: vmovntdq %ymm0, (%eax)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: movnt_dq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovntdq %ymm0, (%eax)
+; AVX512VL-NEXT: retl
%a2 = add <2 x i64> %a1, <i64 1, i64 1>
%a3 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
tail call void @llvm.x86.avx.movnt.dq.256(i8* %p, <4 x i64> %a3) nounwind
@@ -3333,12 +4571,18 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
declare void @llvm.x86.avx.movnt.dq.256(i8*, <4 x i64>) nounwind
define void @movnt_ps(i8* %p, <8 x float> %a) nounwind {
-; CHECK-LABEL: movnt_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmovntps %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: movnt_ps:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmovntps %ymm0, (%eax)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: movnt_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmovntps %ymm0, (%eax)
+; AVX512VL-NEXT: retl
tail call void @llvm.x86.avx.movnt.ps.256(i8* %p, <8 x float> %a) nounwind
ret void
}
@@ -3346,14 +4590,22 @@ declare void @llvm.x86.avx.movnt.ps.256(i8*, <8 x float>) nounwind
define void @movnt_pd(i8* %p, <4 x double> %a1) nounwind {
; add operation forces the execution domain.
-; CHECK-LABEL: movnt_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vmovntpd %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; AVX-LABEL: movnt_pd:
+; AVX: ## BB#0:
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vmovntpd %ymm0, (%eax)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: movnt_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpxord %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovntpd %ymm0, (%eax)
+; AVX512VL-NEXT: retl
%a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
tail call void @llvm.x86.avx.movnt.pd.256(i8* %p, <4 x double> %a2) nounwind
ret void
@@ -3363,10 +4615,15 @@ declare void @llvm.x86.avx.movnt.pd.256(i8*, <4 x double>) nounwind
; Check for pclmulqdq
define <2 x i64> @test_x86_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_pclmulqdq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retl
+; AVX-LABEL: test_x86_pclmulqdq:
+; AVX: ## BB#0:
+; AVX-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_pclmulqdq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
diff --git a/test/CodeGen/X86/avx-intrinsics-x86_64.ll b/test/CodeGen/X86/avx-intrinsics-x86_64.ll
index 5a466fc3250f..252574d84d8f 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86_64.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86-64 -mcpu=corei7 -mattr=avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86-64 -mcpu=corei7 -mattr=avx512vl | FileCheck %s
define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) {
; CHECK: vcvtsd2si
diff --git a/test/CodeGen/X86/avx-isa-check.ll b/test/CodeGen/X86/avx-isa-check.ll
index 77bfbd4bb423..dffc8078e44f 100644
--- a/test/CodeGen/X86/avx-isa-check.ll
+++ b/test/CodeGen/X86/avx-isa-check.ll
@@ -1,5 +1,6 @@
; check AVX2 instructions that are disabled in case avx512VL/avx512BW present
-
+
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=corei7-avx -o /dev/null
; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=core-avx2 -mattr=+avx2 -o /dev/null
; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=knl -o /dev/null
; RUN: llc < %s -mtriple=x86_64-apple-darwin -show-mc-encoding -mcpu=knl -mattr=+avx512vl -o /dev/null
@@ -568,3 +569,114 @@ define <8 x i16> @shl_const_v8i16(<8 x i16> %a) {
%shift = shl <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <8 x i16> %shift
}
+
+define <8 x i16> @zext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
+entry:
+ %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %C = zext <8 x i8> %B to <8 x i16>
+ ret <8 x i16> %C
+}
+
+define <32 x i8> @_broadcast32xi8(i8 %a) {
+ %b = insertelement <32 x i8> undef, i8 %a, i32 0
+ %c = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
+ ret <32 x i8> %c
+}
+
+define <16 x i8> @_broadcast16xi8(i8 %a) {
+ %b = insertelement <16 x i8> undef, i8 %a, i32 0
+ %c = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
+ ret <16 x i8> %c
+}
+
+define <16 x i16> @_broadcast16xi16(i16 %a) {
+ %b = insertelement <16 x i16> undef, i16 %a, i32 0
+ %c = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
+ ret <16 x i16> %c
+}
+
+define <8 x i16> @_broadcast8xi16(i16 %a) {
+ %b = insertelement <8 x i16> undef, i16 %a, i32 0
+ %c = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %c
+}
+
+define <8 x i32> @_broadcast8xi32(i32 %a) {
+ %b = insertelement <8 x i32> undef, i32 %a, i32 0
+ %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
+ ret <8 x i32> %c
+}
+
+define <4 x i32> @_broadcast4xi32(i32 %a) {
+ %b = insertelement <4 x i32> undef, i32 %a, i32 0
+ %c = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %c
+}
+
+define <4 x i64> @_broadcast4xi64(i64 %a) {
+ %b = insertelement <4 x i64> undef, i64 %a, i64 0
+ %c = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
+ ret <4 x i64> %c
+}
+
+define <2 x i64> @_broadcast2xi64(i64 %a) {
+ %b = insertelement <2 x i64> undef, i64 %a, i64 0
+ %c = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %c
+}
+
+define <8 x float> @_broadcast8xfloat(float %a) {
+ %b = insertelement <8 x float> undef, float %a, i32 0
+ %c = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %c
+}
+
+define <4 x float> @_broadcast4xfloat(float %a) {
+ %b = insertelement <4 x float> undef, float %a, i32 0
+ %c = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %c
+}
+
+define <4 x double> @_broadcast4xdouble(double %a) {
+ %b = insertelement <4 x double> undef, double %a, i32 0
+ %c = shufflevector <4 x double> %b, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %c
+}
+
+define <2 x double> @_broadcast2xdouble(double %a) {
+ %b = insertelement <2 x double> undef, double %a, i32 0
+ %c = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
+ ret <2 x double> %c
+}
+
+define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = fmul <4 x float> %a0, %a1
+ %res = fsub <4 x float> %x, %a2
+ ret <4 x float> %res
+}
+
+define <32 x i8> @test_cmpgtb(<32 x i8> %A) {
+; generate the following code:
+; vpxor %ymm1, %ymm1, %ymm1
+; vpcmpgtb %ymm0, %ymm1, %ymm0
+ %B = ashr <32 x i8> %A, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ ret <32 x i8> %B
+}
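The expected vpxor/vpcmpgtb lowering noted in the comment above relies on the fact that an arithmetic shift right by 7 of an i8 lane produces an all-ones byte for negative values and zero otherwise, which is exactly a signed greater-than comparison against zero. A minimal scalar C sketch of that identity follows; it is editorial illustration only, not part of the patch, the helper is hypothetical, and it assumes the usual arithmetic right shift for negative signed values.

#include <assert.h>
#include <stdint.h>

/* Check that (a >> 7) for a signed byte equals the vpcmpgtb-style mask
   (0 > a ? -1 : 0) for every possible i8 value. */
int main(void) {
  for (int v = -128; v <= 127; ++v) {
    int8_t a = (int8_t)v;
    int8_t shifted = (int8_t)(a >> 7);         /* per-lane ashr by 7 */
    int8_t mask = (int8_t)((0 > a) ? -1 : 0);  /* vpcmpgtb zero, a */
    assert(shifted == mask);
  }
  return 0;
}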
+
+define <4 x float> @_inreg4xfloat(float %a) {
+ %b = insertelement <4 x float> undef, float %a, i32 0
+ %c = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %c
+}
+
+define <8 x float> @_inreg8xfloat(float %a) {
+ %b = insertelement <8 x float> undef, float %a, i32 0
+ %c = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %c
+}
+
+define <4 x double> @_inreg4xdouble(double %a) {
+ %b = insertelement <4 x double> undef, double %a, i32 0
+ %c = shufflevector <4 x double> %b, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %c
+}
diff --git a/test/CodeGen/X86/avx-select.ll b/test/CodeGen/X86/avx-select.ll
index 58a75ef0a25d..cdd3180d6245 100644
--- a/test/CodeGen/X86/avx-select.ll
+++ b/test/CodeGen/X86/avx-select.ll
@@ -1,19 +1,34 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
-; CHECK: _select00
-; CHECK: vmovaps
-; CHECK-NEXT: LBB
define <8 x i32> @select00(i32 %a, <8 x i32> %b) nounwind {
+; CHECK-LABEL: select00:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: cmpl $255, %edi
+; CHECK-NEXT: je LBB0_2
+; CHECK-NEXT: ## BB#1:
+; CHECK-NEXT: vmovaps %ymm0, %ymm1
+; CHECK-NEXT: LBB0_2:
+; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%cmpres = icmp eq i32 %a, 255
%selres = select i1 %cmpres, <8 x i32> zeroinitializer, <8 x i32> %b
%res = xor <8 x i32> %b, %selres
ret <8 x i32> %res
}
-; CHECK: _select01
-; CHECK: vmovaps
-; CHECK-NEXT: LBB
define <4 x i64> @select01(i32 %a, <4 x i64> %b) nounwind {
+; CHECK-LABEL: select01:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: cmpl $255, %edi
+; CHECK-NEXT: je LBB1_2
+; CHECK-NEXT: ## BB#1:
+; CHECK-NEXT: vmovaps %ymm0, %ymm1
+; CHECK-NEXT: LBB1_2:
+; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%cmpres = icmp eq i32 %a, 255
%selres = select i1 %cmpres, <4 x i64> zeroinitializer, <4 x i64> %b
%res = xor <4 x i64> %b, %selres
diff --git a/test/CodeGen/X86/avx-shift.ll b/test/CodeGen/X86/avx-shift.ll
index 033a95276608..b65412d99eb4 100644
--- a/test/CodeGen/X86/avx-shift.ll
+++ b/test/CodeGen/X86/avx-shift.ll
@@ -10,8 +10,7 @@ define <8 x i32> @vshift00(<8 x i32> %a) {
; CHECK-NEXT: vpslld $2, %xmm0, %xmm0
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: retq
- %s = shl <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
-2>
+ %s = shl <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
ret <8 x i32> %s
}
@@ -48,8 +47,7 @@ define <8 x i32> @vshift03(<8 x i32> %a) {
; CHECK-NEXT: vpsrld $2, %xmm0, %xmm0
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: retq
- %s = lshr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
-2>
+ %s = lshr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
ret <8 x i32> %s
}
@@ -86,8 +84,7 @@ define <8 x i32> @vshift06(<8 x i32> %a) {
; CHECK-NEXT: vpsrad $2, %xmm0, %xmm0
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: retq
- %s = ashr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32
-2>
+ %s = ashr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
ret <8 x i32> %s
}
diff --git a/test/CodeGen/X86/avx-shuffle-x86_32.ll b/test/CodeGen/X86/avx-shuffle-x86_32.ll
index fae5b41abfa6..d0634ab59f56 100755
--- a/test/CodeGen/X86/avx-shuffle-x86_32.ll
+++ b/test/CodeGen/X86/avx-shuffle-x86_32.ll
@@ -6,7 +6,7 @@ define <4 x i64> @test1(<4 x i64> %a) nounwind {
; CHECK-LABEL: test1:
; CHECK: # BB#0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-NEXT: retl
%b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
ret <4 x i64>%b
diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll
index ebaaf0e8d00d..1914b5134bee 100644
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
@@ -14,7 +15,8 @@ entry:
define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcB:
; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -26,7 +28,7 @@ define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcC:
; CHECK: ## BB#0: ## %entry
; CHECK-NEXT: vmovq %rdi, %xmm0
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -122,9 +124,8 @@ entry:
define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcH:
; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,1,1,5,5,5,5]
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; CHECK-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
@@ -134,8 +135,7 @@ entry:
define <2 x double> @splat_load_2f64_11(<2 x double>* %ptr) {
; CHECK-LABEL: splat_load_2f64_11:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm0
-; CHECK-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; CHECK-NEXT: retq
%x = load <2 x double>, <2 x double>* %ptr
%x1 = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 1>
diff --git a/test/CodeGen/X86/avx-trunc.ll b/test/CodeGen/X86/avx-trunc.ll
index 27be9fd2fcd1..70c8ecb9d4ad 100755
--- a/test/CodeGen/X86/avx-trunc.ll
+++ b/test/CodeGen/X86/avx-trunc.ll
@@ -1,22 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
define <4 x i32> @trunc_64_32(<4 x i64> %A) nounwind uwtable readnone ssp{
-; CHECK-LABEL: trunc_64_32
-; CHECK: pshufd
-; CHECK: pshufd
-; CHECK: pblendw
+; CHECK-LABEL: trunc_64_32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%B = trunc <4 x i64> %A to <4 x i32>
ret <4 x i32>%B
}
+
define <8 x i16> @trunc_32_16(<8 x i32> %A) nounwind uwtable readnone ssp{
-; CHECK-LABEL: trunc_32_16
-; CHECK: pshufb
+; CHECK-LABEL: trunc_32_16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%B = trunc <8 x i32> %A to <8 x i16>
ret <8 x i16>%B
}
+
define <16 x i8> @trunc_16_8(<16 x i16> %A) nounwind uwtable readnone ssp{
-; CHECK-LABEL: trunc_16_8
-; CHECK: pshufb
+; CHECK-LABEL: trunc_16_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%B = trunc <16 x i16> %A to <16 x i8>
ret <16 x i8> %B
}
diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll
index 0c92f4884fb7..b312be9aa6b2 100644
--- a/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/test/CodeGen/X86/avx-vbroadcast.ll
@@ -173,14 +173,12 @@ define <8 x i32> @load_splat_8i32_4i32_33333333(<4 x i32>* %ptr) nounwind uwtabl
; X32-LABEL: load_splat_8i32_4i32_33333333:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3]
-; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vbroadcastss 12(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8i32_4i32_33333333:
; X64: ## BB#0: ## %entry
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3]
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: vbroadcastss 12(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <4 x i32>, <4 x i32>* %ptr
@@ -277,16 +275,12 @@ define <4 x i64> @load_splat_4i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable re
; X32-LABEL: load_splat_4i64_2i64_1111:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vmovaps (%eax), %xmm0
-; X32-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
-; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vbroadcastsd 8(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4i64_2i64_1111:
; X64: ## BB#0: ## %entry
-; X64-NEXT: vmovaps (%rdi), %xmm0
-; X64-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT: retq
entry:
%ld = load <2 x i64>, <2 x i64>* %ptr
@@ -315,14 +309,12 @@ define <2 x double> @load_splat_2f64_2f64_1111(<2 x double>* %ptr) nounwind uwta
; X32-LABEL: load_splat_2f64_2f64_1111:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vmovaps (%eax), %xmm0
-; X32-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: load_splat_2f64_2f64_1111:
; X64: ## BB#0: ## %entry
-; X64-NEXT: vmovaps (%rdi), %xmm0
-; X64-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
entry:
%ld = load <2 x double>, <2 x double>* %ptr
diff --git a/test/CodeGen/X86/avx-vbroadcastf128.ll b/test/CodeGen/X86/avx-vbroadcastf128.ll
new file mode 100644
index 000000000000..176246b093ec
--- /dev/null
+++ b/test/CodeGen/X86/avx-vbroadcastf128.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64
+
+define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
+; X32-LABEL: test_broadcast_2f64_4f64:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_2f64_4f64:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <2 x double>, <2 x double> *%p
+ %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x double> %2
+}
+
+define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
+; X32-LABEL: test_broadcast_2i64_4i64:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_2i64_4i64:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <2 x i64>, <2 x i64> *%p
+ %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x i64> %2
+}
+
+define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
+; X32-LABEL: test_broadcast_4f32_8f32:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_4f32_8f32:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <4 x float>, <4 x float> *%p
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x float> %2
+}
+
+define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
+; X32-LABEL: test_broadcast_4i32_8i32:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_4i32_8i32:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <4 x i32>, <4 x i32> *%p
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i32> %2
+}
+
+define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
+; X32-LABEL: test_broadcast_8i16_16i16:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_8i16_16i16:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16> *%p
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <16 x i16> %2
+}
+
+define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
+; X32-LABEL: test_broadcast_16i8_32i8:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_16i8_32i8:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8> *%p
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <32 x i8> %2
+}
diff --git a/test/CodeGen/X86/avx-vextractf128.ll b/test/CodeGen/X86/avx-vextractf128.ll
index 297fb250c5ff..2feddddaf780 100644
--- a/test/CodeGen/X86/avx-vextractf128.ll
+++ b/test/CodeGen/X86/avx-vextractf128.ll
@@ -1,28 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
-; CHECK-LABEL: A:
-; CHECK-NOT: vunpck
-; CHECK: vextractf128 $1
define <8 x float> @A(<8 x float> %a) nounwind uwtable readnone ssp {
+; CHECK-LABEL: A:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8>
ret <8 x float> %shuffle
}
-; CHECK-LABEL: B:
-; CHECK-NOT: vunpck
-; CHECK: vextractf128 $1
define <4 x double> @B(<4 x double> %a) nounwind uwtable readnone ssp {
+; CHECK-LABEL: B:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: retq
entry:
%shuffle = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 4>
ret <4 x double> %shuffle
}
-; CHECK-LABEL: t0:
-; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t0(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
+; CHECK-LABEL: t0:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vextractf128 $1, %ymm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 1)
%1 = bitcast float* %addr to <4 x float>*
@@ -30,13 +34,12 @@ entry:
ret void
}
-declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
-
-; CHECK-LABEL: t2:
-; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t2(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
+; CHECK-LABEL: t2:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vextractf128 $1, %ymm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 1)
%1 = bitcast double* %addr to <2 x double>*
@@ -44,13 +47,12 @@ entry:
ret void
}
-declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
-
-; CHECK-LABEL: t4:
-; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t4(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+; CHECK-LABEL: t4:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vextractf128 $1, %ymm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
entry:
%0 = bitcast <4 x i64> %a to <8 x i32>
%1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 1)
@@ -59,11 +61,12 @@ entry:
ret void
}
-declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
-
-; CHECK-LABEL: t5:
-; CHECK: vmovaps %xmm0, (%rdi)
define void @t5(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
+; CHECK-LABEL: t5:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vmovaps %xmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
entry:
%0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
%1 = bitcast float* %addr to <4 x float>*
@@ -71,9 +74,12 @@ entry:
ret void
}
-; CHECK-LABEL: t6:
-; CHECK: vmovaps %xmm0, (%rdi)
define void @t6(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
+; CHECK-LABEL: t6:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vmovaps %xmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
entry:
%0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
%1 = bitcast double* %addr to <2 x double>*
@@ -81,9 +87,12 @@ entry:
ret void
}
-; CHECK-LABEL: t7:
-; CHECK: vmovaps %xmm0, (%rdi)
define void @t7(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+; CHECK-LABEL: t7:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vmovaps %xmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
entry:
%0 = bitcast <4 x i64> %a to <8 x i32>
%1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
@@ -92,9 +101,12 @@ entry:
ret void
}
-; CHECK-LABEL: t8:
-; CHECK: vmovups %xmm0, (%rdi)
define void @t8(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+; CHECK-LABEL: t8:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vmovups %xmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
entry:
%0 = bitcast <4 x i64> %a to <8 x i32>
%1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
@@ -105,6 +117,12 @@ entry:
; PR15462
define void @t9(i64* %p) {
+; CHECK-LABEL: t9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vmovups %ymm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
store i64 0, i64* %p
%q = getelementptr i64, i64* %p, i64 1
store i64 0, i64* %q
@@ -113,9 +131,8 @@ define void @t9(i64* %p) {
%s = getelementptr i64, i64* %p, i64 3
store i64 0, i64* %s
ret void
-
-; CHECK-LABEL: t9:
-; CHECK: vxorps %xmm
-; CHECK-NOT: vextractf
-; CHECK: vmovups
}
+
+declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
+declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
+declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
index 0958008d9a3e..740fd77d82e2 100644
--- a/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -2,8 +2,8 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
-define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: A:
+define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_45670123:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; ALL-NEXT: retq
@@ -12,28 +12,63 @@ entry:
ret <8 x float> %shuffle
}
-define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: B:
+define <8 x float> @shuffle_v8f32_45670123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_45670123_mem:
; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
; ALL-NEXT: retq
entry:
- %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+ %a = load <8 x float>, <8 x float>* %pa
+ %b = load <8 x float>, <8 x float>* %pb
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
ret <8 x float> %shuffle
}
-define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: C:
+define <8 x float> @shuffle_v8f32_0123cdef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_0123cdef:
; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT: retq
entry:
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_01230123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: shuffle_v8f32_01230123:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_01230123:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
+; AVX1-LABEL: shuffle_v8f32_01230123_mem:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_01230123_mem:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX2-NEXT: retq
+entry:
+ %a = load <8 x float>, <8 x float>* %pa
+ %b = load <8 x float>, <8 x float>* %pb
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
ret <8 x float> %shuffle
}
-define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: D:
+define <8 x float> @shuffle_v8f32_45674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_45674567:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT: retq
@@ -42,28 +77,30 @@ entry:
ret <8 x float> %shuffle
}
-define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: E:
+define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_45674567_mem:
; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
; ALL-NEXT: retq
entry:
- %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
- ret <32 x i8> %shuffle
+ %a = load <8 x float>, <8 x float>* %pa
+ %b = load <8 x float>, <8 x float>* %pb
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %shuffle
}
-define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: E2:
+define <32 x i8> @shuffle_v32i8_2323(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v32i8_2323:
; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT: retq
entry:
- %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
- ret <4 x i64> %shuffle
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i8> %shuffle
}
-define <32 x i8> @Ei(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
-; AVX1-LABEL: Ei:
+define <32 x i8> @shuffle_v32i8_2323_domain(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: shuffle_v32i8_2323_domain:
; AVX1: ## BB#0: ## %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
@@ -71,7 +108,7 @@ define <32 x i8> @Ei(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: Ei:
+; AVX2-LABEL: shuffle_v32i8_2323_domain:
; AVX2: ## BB#0: ## %entry
; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
@@ -83,14 +120,24 @@ entry:
ret <32 x i8> %shuffle
}
-define <4 x i64> @E2i(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-; AVX1-LABEL: E2i:
+define <4 x i64> @shuffle_v4i64_6701(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v4i64_6701:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; ALL-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_6701_domain(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: shuffle_v4i64_6701_domain:
; AVX1: ## BB#0: ## %entry
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: E2i:
+; AVX2-LABEL: shuffle_v4i64_6701_domain:
; AVX2: ## BB#0: ## %entry
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
@@ -103,8 +150,8 @@ entry:
ret <4 x i64> %shuffle
}
-define <8 x i32> @E3i(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
-; AVX1-LABEL: E3i:
+define <8 x i32> @shuffle_v8i32_u5u7cdef(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: shuffle_v8i32_u5u7cdef:
; AVX1: ## BB#0: ## %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
@@ -112,7 +159,7 @@ define <8 x i32> @E3i(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: E3i:
+; AVX2-LABEL: shuffle_v8i32_u5u7cdef:
; AVX2: ## BB#0: ## %entry
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
@@ -125,14 +172,14 @@ entry:
ret <8 x i32> %shuffle
}
-define <16 x i16> @E4i(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
-; AVX1-LABEL: E4i:
+define <16 x i16> @shuffle_v16i16_4501(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: shuffle_v16i16_4501:
; AVX1: ## BB#0: ## %entry
; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: E4i:
+; AVX2-LABEL: shuffle_v16i16_4501:
; AVX2: ## BB#0: ## %entry
; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
@@ -144,8 +191,8 @@ entry:
ret <16 x i16> %shuffle
}
-define <16 x i16> @E5i(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
-; AVX1-LABEL: E5i:
+define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: shuffle_v16i16_4501_mem:
; AVX1: ## BB#0: ## %entry
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovaps (%rsi), %ymm1
@@ -153,7 +200,7 @@ define <16 x i16> @E5i(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: E5i:
+; AVX2-LABEL: shuffle_v16i16_4501_mem:
; AVX2: ## BB#0: ## %entry
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa (%rsi), %ymm1
@@ -170,8 +217,8 @@ entry:
;;;; Cases with undef indices mixed in the mask
-define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: F:
+define <8 x float> @shuffle_v8f32_uu67u9ub(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_uu67u9ub:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT: retq
@@ -180,8 +227,8 @@ entry:
ret <8 x float> %shuffle
}
-define <8 x float> @F2(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: F2:
+define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_uu67uu67:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT: retq
@@ -190,8 +237,8 @@ entry:
ret <8 x float> %shuffle
}
-define <8 x float> @F3(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: F3:
+define <8 x float> @shuffle_v8f32_uu67uuab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_uu67uuab:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT: retq
@@ -200,8 +247,8 @@ entry:
ret <8 x float> %shuffle
}
-define <8 x float> @F4(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: F4:
+define <8 x float> @shuffle_v8f32_uu67uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_uu67uuef:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT: retq
@@ -210,8 +257,8 @@ entry:
ret <8 x float> %shuffle
}
-define <8 x float> @F5(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: F5:
+define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_uu674567:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT: retq
@@ -220,8 +267,8 @@ entry:
ret <8 x float> %shuffle
}
-define <8 x float> @F6(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: F6:
+define <8 x float> @shuffle_v8f32_uu6789ab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_uu6789ab:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT: retq
@@ -230,8 +277,8 @@ entry:
ret <8 x float> %shuffle
}
-define <8 x float> @F7(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: F7:
+define <8 x float> @shuffle_v8f32_4567uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_4567uu67:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT: retq
@@ -240,8 +287,8 @@ entry:
ret <8 x float> %shuffle
}
-define <8 x float> @F8(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: F8:
+define <8 x float> @shuffle_v8f32_4567uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_4567uuef:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT: retq
@@ -252,8 +299,8 @@ entry:
;;;; Cases we must not select vperm2f128
-define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: G:
+define <8 x float> @shuffle_v8f32_uu67ucuf(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: shuffle_v8f32_uu67ucuf:
; ALL: ## BB#0: ## %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
@@ -266,9 +313,18 @@ entry:
;; Test zero mask generation.
;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
;; Prefer xor+vblendpd over vperm2f128 because that has better performance.
+;; TODO: When building for optsize we should use vperm2f128.
-define <4 x double> @vperm2z_0x08(<4 x double> %a) {
-; ALL-LABEL: vperm2z_0x08:
+define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) {
+; ALL-LABEL: shuffle_v4f64_zz01:
+; ALL: ## BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
+; ALL-NEXT: retq
+ %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x double> %s
+}
+define <4 x double> @shuffle_v4f64_zz01_optsize(<4 x double> %a) optsize {
+; ALL-LABEL: shuffle_v4f64_zz01_optsize:
; ALL: ## BB#0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
@@ -276,8 +332,17 @@ define <4 x double> @vperm2z_0x08(<4 x double> %a) {
ret <4 x double> %s
}
-define <4 x double> @vperm2z_0x18(<4 x double> %a) {
-; ALL-LABEL: vperm2z_0x18:
+define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) {
+; ALL-LABEL: shuffle_v4f64_zz23:
+; ALL: ## BB#0:
+; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: retq
+ %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+ ret <4 x double> %s
+}
+define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize {
+; ALL-LABEL: shuffle_v4f64_zz23_optsize:
; ALL: ## BB#0:
; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
@@ -286,8 +351,16 @@ define <4 x double> @vperm2z_0x18(<4 x double> %a) {
ret <4 x double> %s
}
-define <4 x double> @vperm2z_0x28(<4 x double> %a) {
-; ALL-LABEL: vperm2z_0x28:
+define <4 x double> @shuffle_v4f64_zz45(<4 x double> %a) {
+; ALL-LABEL: shuffle_v4f64_zz45:
+; ALL: ## BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
+; ALL-NEXT: retq
+ %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x double> %s
+}
+define <4 x double> @shuffle_v4f64_zz45_optsize(<4 x double> %a) optsize {
+; ALL-LABEL: shuffle_v4f64_zz45_optsize:
; ALL: ## BB#0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
@@ -295,8 +368,17 @@ define <4 x double> @vperm2z_0x28(<4 x double> %a) {
ret <4 x double> %s
}
-define <4 x double> @vperm2z_0x38(<4 x double> %a) {
-; ALL-LABEL: vperm2z_0x38:
+define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) {
+; ALL-LABEL: shuffle_v4f64_zz67:
+; ALL: ## BB#0:
+; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: retq
+ %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x double> %s
+}
+define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize {
+; ALL-LABEL: shuffle_v4f64_zz67_optsize:
; ALL: ## BB#0:
; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
@@ -305,8 +387,17 @@ define <4 x double> @vperm2z_0x38(<4 x double> %a) {
ret <4 x double> %s
}
-define <4 x double> @vperm2z_0x80(<4 x double> %a) {
-; ALL-LABEL: vperm2z_0x80:
+define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) {
+; ALL-LABEL: shuffle_v4f64_01zz:
+; ALL: ## BB#0:
+; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT: retq
+ %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x double> %s
+}
+define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
+; ALL-LABEL: shuffle_v4f64_01zz_optsize:
; ALL: ## BB#0:
; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
@@ -315,8 +406,16 @@ define <4 x double> @vperm2z_0x80(<4 x double> %a) {
ret <4 x double> %s
}
-define <4 x double> @vperm2z_0x81(<4 x double> %a) {
-; ALL-LABEL: vperm2z_0x81:
+define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
+; ALL-LABEL: shuffle_v4f64_23zz:
+; ALL: ## BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; ALL-NEXT: retq
+ %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ ret <4 x double> %s
+}
+define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize {
+; ALL-LABEL: shuffle_v4f64_23zz_optsize:
; ALL: ## BB#0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: retq
@@ -324,8 +423,17 @@ define <4 x double> @vperm2z_0x81(<4 x double> %a) {
ret <4 x double> %s
}
-define <4 x double> @vperm2z_0x82(<4 x double> %a) {
-; ALL-LABEL: vperm2z_0x82:
+define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) {
+; ALL-LABEL: shuffle_v4f64_45zz:
+; ALL: ## BB#0:
+; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT: retq
+ %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x double> %s
+}
+define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
+; ALL-LABEL: shuffle_v4f64_45zz_optsize:
; ALL: ## BB#0:
; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
@@ -334,8 +442,16 @@ define <4 x double> @vperm2z_0x82(<4 x double> %a) {
ret <4 x double> %s
}
-define <4 x double> @vperm2z_0x83(<4 x double> %a) {
-; ALL-LABEL: vperm2z_0x83:
+define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
+; ALL-LABEL: shuffle_v4f64_67zz:
+; ALL: ## BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; ALL-NEXT: retq
+ %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+ ret <4 x double> %s
+}
+define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize {
+; ALL-LABEL: shuffle_v4f64_67zz_optsize:
; ALL: ## BB#0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: retq
@@ -345,8 +461,8 @@ define <4 x double> @vperm2z_0x83(<4 x double> %a) {
;; With AVX2, select the integer version of the instruction. Use an add to force the domain selection.
-define <4 x i64> @vperm2z_int_0x83(<4 x i64> %a, <4 x i64> %b) {
-; AVX1-LABEL: vperm2z_int_0x83:
+define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_67zz:
; AVX1: ## BB#0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -356,7 +472,7 @@ define <4 x i64> @vperm2z_int_0x83(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: vperm2z_int_0x83:
+; AVX2-LABEL: shuffle_v4i64_67zz:
; AVX2: ## BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
@@ -366,3 +482,174 @@ define <4 x i64> @vperm2z_int_0x83(<4 x i64> %a, <4 x i64> %b) {
ret <4 x i64> %c
}
+;;; Memory folding cases
+
+define <4 x double> @ld0_hi0_lo1_4f64(<4 x double> * %pa, <4 x double> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: ld0_hi0_lo1_4f64:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
+; AVX1-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ld0_hi0_lo1_4f64:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
+; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %a = load <4 x double>, <4 x double> * %pa
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
+ ret <4 x double> %res
+}
+
+define <4 x double> @ld1_hi0_hi1_4f64(<4 x double> %a, <4 x double> * %pb) nounwind uwtable readnone ssp {
+; AVX1-LABEL: ld1_hi0_hi1_4f64:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; AVX1-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ld1_hi0_hi1_4f64:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %b = load <4 x double>, <4 x double> * %pb
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
+ ret <4 x double> %res
+}
+
+define <8 x float> @ld0_hi0_lo1_8f32(<8 x float> * %pa, <8 x float> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: ld0_hi0_lo1_8f32:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
+; AVX1-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ld0_hi0_lo1_8f32:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
+; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %a = load <8 x float>, <8 x float> * %pa
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
+ ret <8 x float> %res
+}
+
+define <8 x float> @ld1_hi0_hi1_8f32(<8 x float> %a, <8 x float> * %pb) nounwind uwtable readnone ssp {
+; AVX1-LABEL: ld1_hi0_hi1_8f32:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; AVX1-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ld1_hi0_hi1_8f32:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %b = load <8 x float>, <8 x float> * %pb
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
+ ret <8 x float> %res
+}
+
+define <4 x i64> @ld0_hi0_lo1_4i64(<4 x i64> * %pa, <4 x i64> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: ld0_hi0_lo1_4i64:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ld0_hi0_lo1_4i64:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %a = load <4 x i64>, <4 x i64> * %pa
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @ld1_hi0_hi1_4i64(<4 x i64> %a, <4 x i64> * %pb) nounwind uwtable readnone ssp {
+; AVX1-LABEL: ld1_hi0_hi1_4i64:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ld1_hi0_hi1_4i64:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %b = load <4 x i64>, <4 x i64> * %pb
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
+ ret <4 x i64> %res
+}
+
+define <8 x i32> @ld0_hi0_lo1_8i32(<8 x i32> * %pa, <8 x i32> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: ld0_hi0_lo1_8i32:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ld0_hi0_lo1_8i32:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
+; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %a = load <8 x i32>, <8 x i32> * %pa
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @ld1_hi0_hi1_8i32(<8 x i32> %a, <8 x i32> * %pb) nounwind uwtable readnone ssp {
+; AVX1-LABEL: ld1_hi0_hi1_8i32:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ld1_hi0_hi1_8i32:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %b = load <8 x i32>, <8 x i32> * %pb
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
+ ret <8 x i32> %res
+}
diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll
index a16dc70e81c6..3c52aaf71adc 100644
--- a/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/test/CodeGen/X86/avx-vzeroupper.ll
@@ -1,4 +1,9 @@
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-write | FileCheck --check-prefix=FASTYMM %s
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck --check-prefix=BTVER2 %s
+
+; FASTYMM-NOT: vzeroupper
+; BTVER2-NOT: vzeroupper
declare i32 @foo()
declare <4 x float> @do_sse(<4 x float>)
diff --git a/test/CodeGen/X86/avx2-conversions.ll b/test/CodeGen/X86/avx2-conversions.ll
index 176292768253..7c16ec800a5e 100755
--- a/test/CodeGen/X86/avx2-conversions.ll
+++ b/test/CodeGen/X86/avx2-conversions.ll
@@ -4,8 +4,9 @@
define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
; CHECK-LABEL: trunc4:
; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
-; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%B = trunc <4 x i64> %A to <4 x i32>
@@ -17,6 +18,7 @@ define <8 x i16> @trunc8(<8 x i32> %A) nounwind {
; CHECK: ## BB#0:
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%B = trunc <8 x i32> %A to <8 x i16>
diff --git a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..430628c3f800
--- /dev/null
+++ b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -0,0 +1,3388 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c
+
+define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_abs_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpabsb %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_abs_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpabsb %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg = bitcast <4 x i64> %a0 to <32 x i8>
+ %call = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %arg)
+ %res = bitcast <32 x i8> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone
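Per the NOTE at the top of this new file, the IR is meant to mirror what Clang generates for the AVX2 builtins. As an editorial illustration (not part of the patch, function name hypothetical), a minimal C source that would produce the @llvm.x86.avx2.pabs.b call checked in test_mm256_abs_epi8 above, assuming an AVX2-enabled Clang invocation of that era such as `clang -mavx2 -S -emit-llvm`, might look like:

#include <immintrin.h>

/* _mm256_abs_epi8 lowers to vpabsb via the llvm.x86.avx2.pabs.b
   intrinsic, matching the CHECK lines in test_mm256_abs_epi8. */
__m256i abs_bytes(__m256i a) {
  return _mm256_abs_epi8(a);
}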
+
+define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_abs_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpabsw %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_abs_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpabsw %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg = bitcast <4 x i64> %a0 to <16 x i16>
+ %call = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %arg)
+ %res = bitcast <16 x i16> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_abs_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpabsd %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_abs_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpabsd %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg = bitcast <4 x i64> %a0 to <8 x i32>
+ %call = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %arg)
+ %res = bitcast <8 x i32> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_add_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_add_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = add <32 x i8> %arg0, %arg1
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_add_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_add_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = add <16 x i16> %arg0, %arg1
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_add_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_add_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = add <8 x i32> %arg0, %arg1
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_add_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_add_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = add <4 x i64> %a0, %a1
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_adds_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpaddsb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_adds_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpaddsb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %arg0, <32 x i8> %arg1)
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_adds_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpaddsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_adds_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpaddsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_adds_epu8:
+; X32: # BB#0:
+; X32-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_adds_epu8:
+; X64: # BB#0:
+; X64-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %arg0, <32 x i8> %arg1)
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_adds_epu16:
+; X32: # BB#0:
+; X32-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_adds_epu16:
+; X64: # BB#0:
+; X64-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_alignr_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_alignr_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
+ %res = bitcast <32 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test2_mm256_alignr_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
+; X32-NEXT: retl
+;
+; X64-LABEL: test2_mm256_alignr_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
+ %res = bitcast <32 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_and_si256:
+; X32: # BB#0:
+; X32-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_and_si256:
+; X64: # BB#0:
+; X64-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = and <4 x i64> %a0, %a1
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_andnot_si256:
+; X32: # BB#0:
+; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X32-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; X32-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_andnot_si256:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X64-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
+ %res = and <4 x i64> %not, %a1
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_avg_epu8:
+; X32: # BB#0:
+; X32-NEXT: vpavgb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_avg_epu8:
+; X64: # BB#0:
+; X64-NEXT: vpavgb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1)
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_avg_epu16:
+; X32: # BB#0:
+; X32-NEXT: vpavgw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_avg_epu16:
+; X64: # BB#0:
+; X64-NEXT: vpavgw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_blend_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_blend_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %res = bitcast <16 x i16> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_blend_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_blend_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ %res = bitcast <4 x i32> %shuf to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_blend_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_blend_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
+ %res = bitcast <8 x i32> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
+; X32-LABEL: test_mm256_blendv_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_blendv_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %arg2 = bitcast <4 x i64> %a2 to <32 x i8>
+ %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2)
+ %res = bitcast <32 x i8> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastb %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastb %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
+ %res = bitcast <16 x i8> %shuf to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastb %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastb %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer
+ %res = bitcast <32 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastss %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = bitcast <4 x i32> %shuf to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastss %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = bitcast <8 x i32> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastq %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastsd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastsd %xmm0, %ymm0
+; X64-NEXT: retq
+ %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
+; X32-LABEL: test_mm_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
+; X32-LABEL: test_mm256_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastsd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastsd %xmm0, %ymm0
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_broadcastsi128_si256:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastsi128_si256:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) {
+; X32-LABEL: test_mm256_broadcastsi128_si256_mem:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
+; X64: # BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %a0 = load <2 x i64>, <2 x i64>* %p0
+ %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x i64> %res
+}
+
+define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastss %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
+; X32-LABEL: test_mm256_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastss %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %ymm0
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastw %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
+ %res = bitcast <8 x i16> %shuf to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastw %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastw %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer
+ %res = bitcast <16 x i16> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_bslli_epi128:
+; X32: # BB#0:
+; X32-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_bslli_epi128:
+; X64: # BB#0:
+; X64-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
+ %res = bitcast <32 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_bsrli_epi128:
+; X32: # BB#0:
+; X32-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_bsrli_epi128:
+; X64: # BB#0:
+; X64-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
+ %res = bitcast <32 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_cmpeq_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmpeq_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %cmp = icmp eq <32 x i8> %arg0, %arg1
+ %res = sext <32 x i1> %cmp to <32 x i8>
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_cmpeq_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmpeq_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %cmp = icmp eq <16 x i16> %arg0, %arg1
+ %res = sext <16 x i1> %cmp to <16 x i16>
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_cmpeq_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmpeq_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %cmp = icmp eq <8 x i32> %arg0, %arg1
+ %res = sext <8 x i1> %cmp to <8 x i32>
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_cmpeq_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmpeq_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %cmp = icmp eq <4 x i64> %a0, %a1
+ %res = sext <4 x i1> %cmp to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_cmpgt_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmpgt_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %cmp = icmp sgt <32 x i8> %arg0, %arg1
+ %res = sext <32 x i1> %cmp to <32 x i8>
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_cmpgt_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmpgt_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %cmp = icmp sgt <16 x i16> %arg0, %arg1
+ %res = sext <16 x i1> %cmp to <16 x i16>
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_cmpgt_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmpgt_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %cmp = icmp sgt <8 x i32> %arg0, %arg1
+ %res = sext <8 x i1> %cmp to <8 x i32>
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_cmpgt_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmpgt_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %cmp = icmp sgt <4 x i64> %a0, %a1
+ %res = sext <4 x i1> %cmp to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepi8_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmovsxbw %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi8_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmovsxbw %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %ext = sext <16 x i8> %arg0 to <16 x i16>
+ %res = bitcast <16 x i16> %ext to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepi8_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmovsxbd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi8_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmovsxbd %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %ext = sext <8 x i8> %shuf to <8 x i32>
+ %res = bitcast <8 x i32> %ext to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepi8_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpmovsxbq %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi8_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmovsxbq %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %ext = sext <4 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %ext
+}
+
+define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepi16_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmovsxwd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi16_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmovsxwd %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %ext = sext <8 x i16> %arg0 to <8 x i32>
+ %res = bitcast <8 x i32> %ext to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepi16_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpmovsxwq %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi16_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmovsxwq %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %ext = sext <4 x i16> %shuf to <4 x i64>
+ ret <4 x i64> %ext
+}
+
+define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepi32_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpmovsxdq %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi32_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmovsxdq %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %ext = sext <4 x i32> %arg0 to <4 x i64>
+ ret <4 x i64> %ext
+}
+
+define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepu8_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepu8_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %ext = zext <16 x i8> %arg0 to <16 x i16>
+ %res = bitcast <16 x i16> %ext to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepu8_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepu8_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %ext = zext <8 x i8> %shuf to <8 x i32>
+ %res = bitcast <8 x i32> %ext to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepu8_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepu8_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %ext = zext <4 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %ext
+}
+
+define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepu16_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepu16_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %ext = zext <8 x i16> %arg0 to <8 x i32>
+ %res = bitcast <8 x i32> %ext to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepu16_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepu16_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %ext = zext <4 x i16> %shuf to <4 x i64>
+ ret <4 x i64> %ext
+}
+
+define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepu32_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepu32_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %ext = zext <4 x i32> %arg0 to <4 x i64>
+ ret <4 x i64> %ext
+}
+
+define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_extracti128_si256:
+; X32: # BB#0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_extracti128_si256:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_hadd_epi16:
+; X32: # BB#0:
+; X32-NEXT: vphaddw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hadd_epi16:
+; X64: # BB#0:
+; X64-NEXT: vphaddw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_hadd_epi32:
+; X32: # BB#0:
+; X32-NEXT: vphaddd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hadd_epi32:
+; X64: # BB#0:
+; X64-NEXT: vphaddd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_hadds_epi16:
+; X32: # BB#0:
+; X32-NEXT: vphaddsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hadds_epi16:
+; X64: # BB#0:
+; X64-NEXT: vphaddsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_hsub_epi16:
+; X32: # BB#0:
+; X32-NEXT: vphsubw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hsub_epi16:
+; X64: # BB#0:
+; X64-NEXT: vphsubw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_hsub_epi32:
+; X32: # BB#0:
+; X32-NEXT: vphsubd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hsub_epi32:
+; X64: # BB#0:
+; X64-NEXT: vphsubd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_hsubs_epi16:
+; X32: # BB#0:
+; X32-NEXT: vphsubsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_hsubs_epi16:
+; X64: # BB#0:
+; X64-NEXT: vphsubsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_i32gather_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
+; X32-NEXT: vmovdqa %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_i32gather_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
+; X64-NEXT: vmovdqa %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast i32 *%a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
+ %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8* %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
+ %bc = bitcast <4 x i32> %call to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly
+
+define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
+; X32-LABEL: test_mm_mask_i32gather_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_i32gather_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast i32 *%a1 to i8*
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
+ %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, i8* %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2)
+ %bc = bitcast <4 x i32> %call to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_i32gather_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X32-NEXT: vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
+; X32-NEXT: vmovdqa %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_i32gather_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X64-NEXT: vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
+; X64-NEXT: vmovdqa %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast i32 *%a0 to i8*
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32>
+ %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8* %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2)
+ %bc = bitcast <8 x i32> %call to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly
+
+define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
+; X32-LABEL: test_mm256_mask_i32gather_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_i32gather_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast i32 *%a1 to i8*
+ %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
+ %arg3 = bitcast <4 x i64> %a3 to <8 x i32>
+ %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, i8* %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2)
+ %bc = bitcast <8 x i32> %call to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_i32gather_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
+; X32-NEXT: vmovdqa %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_i32gather_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
+; X64-NEXT: vmovdqa %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast i64 *%a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly
+
+define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
+; X32-LABEL: test_mm_mask_i32gather_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_i32gather_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
+; X64-NEXT: retq
+ %arg1 = bitcast i64 *%a1 to i8*
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2)
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_i32gather_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X32-NEXT: vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
+; X32-NEXT: vmovdqa %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_i32gather_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
+; X64-NEXT: vmovdqa %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast i64 *%a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly
+
+define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x i64> %a2, <4 x i64> %a3) {
+; X32-LABEL: test_mm256_mask_i32gather_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_i32gather_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
+; X64-NEXT: retq
+ %arg1 = bitcast i64 *%a1 to i8*
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2)
+ ret <4 x i64> %res
+}
+
+define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_i32gather_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
+; X32-NEXT: vmovapd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_i32gather_pd:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
+; X64-NEXT: vmovapd %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast double *%a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
+ %sext = sext <2 x i1> %cmp to <2 x i64>
+ %mask = bitcast <2 x i64> %sext to <2 x double>
+ %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly
+
+define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
+; X32-LABEL: test_mm_mask_i32gather_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_i32gather_pd:
+; X64: # BB#0:
+; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
+; X64-NEXT: retq
+ %arg1 = bitcast double *%a1 to i8*
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2)
+ ret <2 x double> %res
+}
+
+define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_i32gather_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X32-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
+; X32-NEXT: vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
+; X32-NEXT: vmovapd %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_i32gather_pd:
+; X64: # BB#0:
+; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X64-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
+; X64-NEXT: vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
+; X64-NEXT: vmovapd %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast double *%a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
+ %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly
+
+define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1, <2 x i64> %a2, <4 x double> %a3) {
+; X32-LABEL: test_mm256_mask_i32gather_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_i32gather_pd:
+; X64: # BB#0:
+; X64-NEXT: vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
+; X64-NEXT: retq
+ %arg1 = bitcast double *%a1 to i8*
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2)
+ ret <4 x double> %res
+}
+
+define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_i32gather_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
+; X32-NEXT: vmovaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_i32gather_ps:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
+; X64-NEXT: vmovaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast float *%a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %mask = bitcast <4 x i32> %sext to <4 x float>
+ %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2)
+ ret <4 x float> %call
+}
+declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly
+
+define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
+; X32-LABEL: test_mm_mask_i32gather_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_i32gather_ps:
+; X64: # BB#0:
+; X64-NEXT: vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
+; X64-NEXT: retq
+ %arg1 = bitcast float *%a1 to i8*
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2)
+ ret <4 x float> %call
+}
+
+define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_i32gather_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-NEXT: vcmpeqps %ymm1, %ymm1, %ymm2
+; X32-NEXT: vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
+; X32-NEXT: vmovaps %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_i32gather_ps:
+; X64: # BB#0:
+; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64-NEXT: vcmpeqps %ymm1, %ymm1, %ymm2
+; X64-NEXT: vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
+; X64-NEXT: vmovaps %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast float *%a0 to i8*
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0)
+ %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2)
+ ret <8 x float> %call
+}
+declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly
+
+define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4 x i64> %a2, <8 x float> %a3) {
+; X32-LABEL: test_mm256_mask_i32gather_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_i32gather_ps:
+; X64: # BB#0:
+; X64-NEXT: vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
+; X64-NEXT: retq
+ %arg1 = bitcast float *%a1 to i8*
+ %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
+ %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2)
+ ret <8 x float> %call
+}
+
+define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_i64gather_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
+; X32-NEXT: vmovdqa %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_i64gather_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
+; X64-NEXT: vmovdqa %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast i32 *%a0 to i8*
+ %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
+ %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2)
+ %bc = bitcast <4 x i32> %call to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly
+
+define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
+; X32-LABEL: test_mm_mask_i64gather_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_i64gather_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast i32 *%a1 to i8*
+ %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
+ %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, i8* %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2)
+ %bc = bitcast <4 x i32> %call to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_i64gather_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
+; X32-NEXT: vmovdqa %xmm1, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_i64gather_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
+; X64-NEXT: vmovdqa %xmm1, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast i32 *%a0 to i8*
+ %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
+ %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2)
+ %bc = bitcast <4 x i32> %call to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly
+
+define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x i64> %a2, <2 x i64> %a3) {
+; X32-LABEL: test_mm256_mask_i64gather_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_i64gather_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast i32 *%a1 to i8*
+ %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
+ %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, i8* %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2)
+ %bc = bitcast <4 x i32> %call to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_i64gather_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
+; X32-NEXT: vmovdqa %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_i64gather_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
+; X64-NEXT: vmovdqa %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast i64 *%a0 to i8*
+ %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2)
+ ret <2 x i64> %call
+}
+declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly
+
+define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
+; X32-LABEL: test_mm_mask_i64gather_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_i64gather_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
+; X64-NEXT: retq
+ %arg1 = bitcast i64 *%a1 to i8*
+ %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
+ ret <2 x i64> %call
+}
+
+define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_i64gather_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X32-NEXT: vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
+; X32-NEXT: vmovdqa %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_i64gather_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
+; X64-NEXT: vmovdqa %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast i64 *%a0 to i8*
+ %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
+ ret <4 x i64> %call
+}
+declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly
+
+define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, i64 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
+; X32-LABEL: test_mm256_mask_i64gather_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_i64gather_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
+; X64-NEXT: retq
+ %arg1 = bitcast i64 *%a1 to i8*
+ %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
+ ret <4 x i64> %call
+}
+
+define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_i64gather_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
+; X32-NEXT: vmovapd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_i64gather_pd:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
+; X64-NEXT: vmovapd %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast double *%a0 to i8*
+ %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
+ %sext = sext <2 x i1> %cmp to <2 x i64>
+ %mask = bitcast <2 x i64> %sext to <2 x double>
+ %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2)
+ ret <2 x double> %call
+}
+declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly
+
+define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
+; X32-LABEL: test_mm_mask_i64gather_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_i64gather_pd:
+; X64: # BB#0:
+; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
+; X64-NEXT: retq
+ %arg1 = bitcast double *%a1 to i8*
+ %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
+ ret <2 x double> %call
+}
+
+define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_i64gather_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X32-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
+; X32-NEXT: vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
+; X32-NEXT: vmovapd %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_i64gather_pd:
+; X64: # BB#0:
+; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X64-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
+; X64-NEXT: vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
+; X64-NEXT: vmovapd %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast double *%a0 to i8*
+ %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
+ %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2)
+ ret <4 x double> %call
+}
+declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly
+
+define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, i64 *%a1, <4 x i64> %a2, <4 x double> %a3) {
+; X32-LABEL: test_mm256_mask_i64gather_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_i64gather_pd:
+; X64: # BB#0:
+; X64-NEXT: vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
+; X64-NEXT: retq
+ %arg1 = bitcast i64 *%a1 to i8*
+ %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
+ ret <4 x double> %call
+}
+
+define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_i64gather_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
+; X32-NEXT: vmovaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_i64gather_ps:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
+; X64-NEXT: vmovaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast float *%a0 to i8*
+ %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %mask = bitcast <4 x i32> %sext to <4 x float>
+ %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2)
+ ret <4 x float> %call
+}
+declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly
+
+define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
+; X32-LABEL: test_mm_mask_i64gather_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_i64gather_ps:
+; X64: # BB#0:
+; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
+; X64-NEXT: retq
+ %arg1 = bitcast float *%a1 to i8*
+ %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2)
+ ret <4 x float> %call
+}
+
+define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_i64gather_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
+; X32-NEXT: vmovaps %xmm1, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_i64gather_ps:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
+; X64-NEXT: vmovaps %xmm1, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast float *%a0 to i8*
+ %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %mask = bitcast <4 x i32> %sext to <4 x float>
+ %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2)
+ ret <4 x float> %call
+}
+declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly
+
+define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 x i64> %a2, <4 x float> %a3) {
+; X32-LABEL: test_mm256_mask_i64gather_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_i64gather_ps:
+; X64: # BB#0:
+; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg1 = bitcast float *%a1 to i8*
+ %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
+ ret <4 x float> %call
+}
+
+define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test0_mm256_inserti128_si256:
+; X32: # BB#0:
+; X32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X32-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test0_mm256_inserti128_si256:
+; X64: # BB#0:
+; X64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X64-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64-NEXT: retq
+ %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test1_mm256_inserti128_si256:
+; X32: # BB#0:
+; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test1_mm256_inserti128_si256:
+; X64: # BB#0:
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_madd_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_madd_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_maddubs_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maddubs_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_maskload_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskload_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast i32* %a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %call = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %call to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly
+
+define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_maskload_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmaskmovd (%eax), %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskload_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast i32* %a0 to i8*
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %arg0, <8 x i32> %arg1)
+ %bc = bitcast <8 x i32> %call to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly
+
+define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_maskload_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmaskmovq (%eax), %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskload_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast i64* %a0 to i8*
+ %res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %arg0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly
+
+define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_maskload_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmaskmovq (%eax), %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskload_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast i64* %a0 to i8*
+ %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %arg0, <4 x i64> %a1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly
+
+define void @test_mm_maskstore_epi32(float* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maskstore_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskstore_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to i8*
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ call void @llvm.x86.avx2.maskstore.d(i8* %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
+ ret void
+}
+declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind readnone
+
+define void @test_mm256_maskstore_epi32(float* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
+; X32-LABEL: test_mm256_maskstore_epi32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskstore_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to i8*
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
+ call void @llvm.x86.avx2.maskstore.d.256(i8* %arg0, <8 x i32> %arg1, <8 x i32> %arg2)
+ ret void
+}
+declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind readnone
+
+define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maskstore_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskstore_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast i64* %a0 to i8*
+ call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2)
+ ret void
+}
+declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind readnone
+
+define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
+; X32-LABEL: test_mm256_maskstore_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskstore_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast i64* %a0 to i8*
+ call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2)
+ ret void
+}
+declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind readnone
+
+define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_max_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %cmp = icmp sgt <32 x i8> %arg0, %arg1
+ %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
+ %bc = bitcast <32 x i8> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_max_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %cmp = icmp sgt <16 x i16> %arg0, %arg1
+ %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
+ %bc = bitcast <16 x i16> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_max_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %cmp = icmp sgt <8 x i32> %arg0, %arg1
+ %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
+ %bc = bitcast <8 x i32> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_max_epu8:
+; X32: # BB#0:
+; X32-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_epu8:
+; X64: # BB#0:
+; X64-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %cmp = icmp ugt <32 x i8> %arg0, %arg1
+ %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
+ %bc = bitcast <32 x i8> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_max_epu16:
+; X32: # BB#0:
+; X32-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_epu16:
+; X64: # BB#0:
+; X64-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %cmp = icmp ugt <16 x i16> %arg0, %arg1
+ %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
+ %bc = bitcast <16 x i16> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_max_epu32:
+; X32: # BB#0:
+; X32-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_epu32:
+; X64: # BB#0:
+; X64-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %cmp = icmp ugt <8 x i32> %arg0, %arg1
+ %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
+ %bc = bitcast <8 x i32> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_min_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %cmp = icmp slt <32 x i8> %arg0, %arg1
+ %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
+ %bc = bitcast <32 x i8> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_min_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %cmp = icmp slt <16 x i16> %arg0, %arg1
+ %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
+ %bc = bitcast <16 x i16> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_min_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %cmp = icmp slt <8 x i32> %arg0, %arg1
+ %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
+ %bc = bitcast <8 x i32> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_min_epu8:
+; X32: # BB#0:
+; X32-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_epu8:
+; X64: # BB#0:
+; X64-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %cmp = icmp ult <32 x i8> %arg0, %arg1
+ %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
+ %bc = bitcast <32 x i8> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_min_epu16:
+; X32: # BB#0:
+; X32-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_epu16:
+; X64: # BB#0:
+; X64-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %cmp = icmp ult <16 x i16> %arg0, %arg1
+ %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
+ %bc = bitcast <16 x i16> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_min_epu32:
+; X32: # BB#0:
+; X32-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_epu32:
+; X64: # BB#0:
+; X64-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %cmp = icmp ult <8 x i32> %arg0, %arg1
+ %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
+ %bc = bitcast <8 x i32> %sel to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_movemask_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpmovmskb %ymm0, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_movemask_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpmovmskb %ymm0, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
+ ret i32 %res
+}
+declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_mpsadbw_epu8:
+; X32: # BB#0:
+; X32-NEXT: vmpsadbw $3, %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mpsadbw_epu8:
+; X64: # BB#0:
+; X64-NEXT: vmpsadbw $3, %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
+ %bc = bitcast <16 x i16> %call to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
+
+define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_mul_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mul_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %arg0, <8 x i32> %arg1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_mul_epu32:
+; X32: # BB#0:
+; X32-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mul_epu32:
+; X64: # BB#0:
+; X64-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %arg0, <8 x i32> %arg1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_mulhi_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mulhi_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_mulhi_epu16:
+; X32: # BB#0:
+; X32-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mulhi_epu16:
+; X64: # BB#0:
+; X64-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_mulhrs_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mulhrs_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_mullo_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mullo_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = mul <16 x i16> %arg0, %arg1
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_mullo_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mullo_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = mul <8 x i32> %arg0, %arg1
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_or_si256:
+; X32: # BB#0:
+; X32-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_or_si256:
+; X64: # BB#0:
+; X64-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = or <4 x i64> %a0, %a1
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_packs_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_packs_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
+ %res = bitcast <32 x i8> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_packs_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_packs_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
+ %res = bitcast <16 x i16> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_packus_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_packus_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
+ %res = bitcast <32 x i8> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_packus_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_packus_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
+ %res = bitcast <16 x i16> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_permute2x128_si256:
+; X32: # BB#0:
+; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute2x128_si256:
+; X64: # BB#0:
+; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X64-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 49)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
+
+define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_permute4x64_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,2,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute4x64_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,2,0]
+; X64-NEXT: retq
+ %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
+ ret <4 x i64> %res
+}
+
+define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
+; X32-LABEL: test_mm256_permute4x64_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute4x64_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
+ ret <4 x double> %res
+}
+
+define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_permutevar8x32_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permutevar8x32_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
+ %res = bitcast <8 x i32> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
+
+define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_permutevar8x32_ps:
+; X32: # BB#0:
+; X32-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permutevar8x32_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
+
+define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_sad_epu8:
+; X32: # BB#0:
+; X32-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sad_epu8:
+; X64: # BB#0:
+; X64-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_shuffle_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
+ %res = bitcast <8 x i32> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_shuffle_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
+ %res = bitcast <32 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_shufflehi_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shufflehi_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
+ %res = bitcast <16 x i16> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_shufflelo_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shufflelo_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
+ %res = bitcast <16 x i16> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_sign_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpsignb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sign_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpsignb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
+ %res = bitcast <32 x i8> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_sign_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpsignw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sign_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpsignw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %res = bitcast <16 x i16> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_sign_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsignd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sign_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsignd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
+ %res = bitcast <8 x i32> %call to <4 x i64>
+ ret <4 x i64> %res
+}
+declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_sll_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpsllw %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sll_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpsllw %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_sll_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpslld %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sll_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpslld %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_sll_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpsllq %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sll_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpsllq %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_slli_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpsllw $3, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_slli_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpsllw $3, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone
+
+define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_slli_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpslld $3, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_slli_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpslld $3, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone
+
+define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_slli_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpsllq $3, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_slli_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpsllq $3, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
+
+define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_slli_si256:
+; X32: # BB#0:
+; X32-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_slli_si256:
+; X64: # BB#0:
+; X64-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
+ %res = bitcast <32 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sllv_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sllv_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_sllv_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sllv_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sllv_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sllv_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_sllv_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sllv_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_sra_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpsraw %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sra_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpsraw %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_sra_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsrad %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sra_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsrad %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_srai_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpsraw $3, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srai_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpsraw $3, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
+
+define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_srai_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsrad $3, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srai_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsrad $3, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_srav_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srav_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_srav_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srav_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_srl_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srl_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_srl_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsrld %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srl_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsrld %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_srl_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srl_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_srli_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpsrlw $3, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srli_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpsrlw $3, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
+
+define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_srli_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsrld $3, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srli_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsrld $3, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone
+
+define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_srli_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpsrlq $3, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srli_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpsrlq $3, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone
+
+define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_srli_si256:
+; X32: # BB#0:
+; X32-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srli_si256:
+; X64: # BB#0:
+; X64-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
+ %res = bitcast <32 x i8> %shuf to <4 x i64>
+ ret <4 x i64> %res
+}
+
+define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_srlv_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srlv_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_srlv_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srlv_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_srlv_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srlv_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_srlv_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_srlv_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <4 x i64> @test_mm256_stream_load_si256(<4 x i64> *%a0) {
+; X32-LABEL: test_mm256_stream_load_si256:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovntdqa (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_stream_load_si256:
+; X64: # BB#0:
+; X64-NEXT: vmovntdqa (%rdi), %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> *%a0 to i8*
+ %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %arg0)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
+
+define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_sub_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sub_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = sub <32 x i8> %arg0, %arg1
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_sub_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sub_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = sub <16 x i16> %arg0, %arg1
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_sub_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sub_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = sub <8 x i32> %arg0, %arg1
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_sub_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_sub_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = sub <4 x i64> %a0, %a1
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_subs_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpsubsb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_subs_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpsubsb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %arg0, <32 x i8> %arg1)
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_subs_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpsubsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_subs_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpsubsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_subs_epu8:
+; X32: # BB#0:
+; X32-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_subs_epu8:
+; X64: # BB#0:
+; X64-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %arg0, <32 x i8> %arg1)
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_subs_epu16:
+; X32: # BB#0:
+; X32-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_subs_epu16:
+; X64: # BB#0:
+; X64-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_unpackhi_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpackhi_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_unpackhi_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpackhi_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_unpackhi_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpackhi_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_unpackhi_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpackhi_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X64-NEXT: retq
+ %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_unpacklo_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpacklo_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+ %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+ %bc = bitcast <32 x i8> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_unpacklo_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpacklo_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+ %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+ %bc = bitcast <16 x i16> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_unpacklo_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpacklo_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+ %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %bc = bitcast <8 x i32> %res to <4 x i64>
+ ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_unpacklo_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_unpacklo_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X64-NEXT: retq
+ %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; X32-LABEL: test_mm256_xor_si256:
+; X32: # BB#0:
+; X32-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_xor_si256:
+; X64: # BB#0:
+; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = xor <4 x i64> %a0, %a1
+ ret <4 x i64> %res
+}
+
+declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
index 36b6da5ef960..b6b8447beda1 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -1,7 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mattr=avx2 | FileCheck %s
define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpblendw
+; CHECK-LABEL: test_x86_avx2_pblendw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; CHECK-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -9,7 +13,10 @@ declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) nounwind
define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpblendd
+; CHECK-LABEL: test_x86_avx2_pblendd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 7) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -17,7 +24,10 @@ declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32) nounwind
define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpblendd
+; CHECK-LABEL: test_x86_avx2_pblendd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; CHECK-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -25,7 +35,10 @@ declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) nounwind
define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vmpsadbw
+; CHECK-LABEL: test_x86_avx2_mpsadbw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -33,7 +46,10 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind re
define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) {
- ; CHECK: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
+; CHECK-LABEL: test_x86_avx2_psll_dq_bs:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
+; CHECK-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -41,7 +57,10 @@ declare <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64>, i32) nounwind readnone
define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) {
- ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
+; CHECK-LABEL: test_x86_avx2_psrl_dq_bs:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -49,7 +68,10 @@ declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone
define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) {
- ; CHECK: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; CHECK-LABEL: test_x86_avx2_psll_dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; CHECK-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 8) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -57,7 +79,10 @@ declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone
define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) {
- ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero
+; CHECK-LABEL: test_x86_avx2_psrl_dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero
+; CHECK-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 8) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -65,9 +90,11 @@ declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone
define <2 x i64> @test_x86_avx2_vextracti128(<4 x i64> %a0) {
-; CHECK-LABEL: test_x86_avx2_vextracti128:
-; CHECK: vextracti128
-
+; CHECK-LABEL: test_x86_avx2_vextracti128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.avx2.vextracti128(<4 x i64> %a0, i8 7)
ret <2 x i64> %res
}
@@ -75,9 +102,10 @@ declare <2 x i64> @llvm.x86.avx2.vextracti128(<4 x i64>, i8) nounwind readnone
define <4 x i64> @test_x86_avx2_vinserti128(<4 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_avx2_vinserti128:
-; CHECK: vinserti128
-
+; CHECK-LABEL: test_x86_avx2_vinserti128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.vinserti128(<4 x i64> %a0, <2 x i64> %a1, i8 7)
ret <4 x i64> %res
}
@@ -85,10 +113,10 @@ declare <4 x i64> @llvm.x86.avx2.vinserti128(<4 x i64>, <2 x i64>, i8) nounwind
define <4 x double> @test_x86_avx2_vbroadcast_sd_pd_256(<2 x double> %a0) {
- ; CHECK-LABEL: test_x86_avx2_vbroadcast_sd_pd_256:
- ; CHECK: ## BB#0:
- ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
- ; CHECK-NEXT: retl
+; CHECK-LABEL: test_x86_avx2_vbroadcast_sd_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT: retl
%res = call <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double> %a0)
ret <4 x double> %res
}
@@ -96,10 +124,10 @@ declare <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double>) nounwind
define <4 x float> @test_x86_avx2_vbroadcast_ss_ps(<4 x float> %a0) {
- ; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps:
- ; CHECK: ## BB#0:
- ; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
- ; CHECK-NEXT: retl
+; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
+; CHECK-NEXT: retl
%res = call <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float> %a0)
ret <4 x float> %res
}
@@ -107,10 +135,10 @@ declare <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float>) nounwind readon
define <8 x float> @test_x86_avx2_vbroadcast_ss_ps_256(<4 x float> %a0) {
- ; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps_256:
- ; CHECK: ## BB#0:
- ; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
- ; CHECK-NEXT: retl
+; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT: retl
%res = call <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float> %a0)
ret <8 x float> %res
}
@@ -203,3 +231,284 @@ define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) {
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64>) nounwind readonly
+
+
+define <8 x i32> @test_x86_avx2_pmovsxbd(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovsxbd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxbd %xmm0, %ymm0
+; CHECK-NEXT: retl
+ %res = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pmovsxbq(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovsxbq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0
+; CHECK-NEXT: retl
+ %res = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_pmovsxbw(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovsxbw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-NEXT: retl
+ %res = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %a0) ; <<16 x i16>> [#uses=1]
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pmovsxdq(<4 x i32> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovsxdq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0
+; CHECK-NEXT: retl
+ %res = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_pmovsxwd(<8 x i16> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovsxwd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0
+; CHECK-NEXT: retl
+ %res = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pmovsxwq(<8 x i16> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovsxwq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxwq %xmm0, %ymm0
+; CHECK-NEXT: retl
+ %res = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_pmovzxbd(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovzxbd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-NEXT: retl
+ %res = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pmovzxbq(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovzxbq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retl
+ %res = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_pmovzxbw(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovzxbw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NEXT: retl
+ %res = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %a0) ; <<16 x i16>> [#uses=1]
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pmovzxdq(<4 x i32> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovzxdq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: retl
+ %res = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_pmovzxwd(<8 x i16> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovzxwd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: retl
+ %res = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pmovzxwq(<8 x i16> %a0) {
+; CHECK-LABEL: test_x86_avx2_pmovzxwq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: retl
+ %res = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone
+
+; This is checked here because the execution dependency fix pass makes it hard to test in AVX mode since we don't have 256-bit integer instructions
+define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
+ ; add operation forces the execution domain.
+; CHECK-LABEL: test_x86_avx_storeu_dq_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: vpaddb LCPI33_0, %ymm0, %ymm0
+; CHECK-NEXT: vmovdqu %ymm0, (%eax)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retl
+ %a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
+ ret void
+}
+declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
+
+define <32 x i8> @mm256_max_epi8(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK-LABEL: mm256_max_epi8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1)
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @mm256_max_epi16(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: mm256_max_epi16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1)
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i32> @mm256_max_epi32(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: mm256_max_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1)
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <32 x i8> @mm256_max_epu8(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK-LABEL: mm256_max_epu8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1)
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @mm256_max_epu16(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: mm256_max_epu16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1)
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i32> @mm256_max_epu32(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: mm256_max_epu32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1)
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <32 x i8> @mm256_min_epi8(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK-LABEL: mm256_min_epi8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1)
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @mm256_min_epi16(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: mm256_min_epi16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1)
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i32> @mm256_min_epi32(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: mm256_min_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1)
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <32 x i8> @mm256_min_epu8(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK-LABEL: mm256_min_epu8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1)
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @mm256_min_epu16(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: mm256_min_epu16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1)
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i32> @mm256_min_epu32(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: mm256_min_epu32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retl
+;
+ %res = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1)
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
+
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 606aca9dc02b..2a04de5fe907 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -1,7 +1,17 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mattr=avx2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx512vl | FileCheck %s --check-prefix=AVX512VL
define <16 x i16> @test_x86_avx2_packssdw(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpackssdw
+; AVX2-LABEL: test_x86_avx2_packssdw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_packssdw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -9,7 +19,15 @@ declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readno
define <32 x i8> @test_x86_avx2_packsswb(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpacksswb
+; AVX2-LABEL: test_x86_avx2_packsswb:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_packsswb:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -17,7 +35,15 @@ declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_packuswb(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpackuswb
+; AVX2-LABEL: test_x86_avx2_packuswb:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_packuswb:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -25,7 +51,15 @@ declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpaddsb
+; AVX2-LABEL: test_x86_avx2_padds_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpaddsb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_padds_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpaddsb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -33,7 +67,15 @@ declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpaddsw
+; AVX2-LABEL: test_x86_avx2_padds_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpaddsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_padds_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpaddsw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -41,7 +83,15 @@ declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpaddusb
+; AVX2-LABEL: test_x86_avx2_paddus_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_paddus_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -49,7 +99,15 @@ declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnon
define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpaddusw
+; AVX2-LABEL: test_x86_avx2_paddus_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_paddus_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -57,7 +115,15 @@ declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind read
define <32 x i8> @test_x86_avx2_pavg_b(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpavgb
+; AVX2-LABEL: test_x86_avx2_pavg_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpavgb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pavg_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpavgb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -65,7 +131,15 @@ declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @test_x86_avx2_pavg_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpavgw
+; AVX2-LABEL: test_x86_avx2_pavg_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpavgw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pavg_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpavgw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -73,7 +147,15 @@ declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readno
define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpmaddwd
+; AVX2-LABEL: test_x86_avx2_pmadd_wd:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmadd_wd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -81,7 +163,15 @@ declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readn
define <16 x i16> @test_x86_avx2_pmaxs_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpmaxsw
+; AVX2-LABEL: test_x86_avx2_pmaxs_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxs_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -89,7 +179,15 @@ declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_pmaxu_b(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpmaxub
+; AVX2-LABEL: test_x86_avx2_pmaxu_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxu_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -97,7 +195,15 @@ declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @test_x86_avx2_pmins_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpminsw
+; AVX2-LABEL: test_x86_avx2_pmins_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmins_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -105,7 +211,15 @@ declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_pminu_b(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpminub
+; AVX2-LABEL: test_x86_avx2_pminu_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pminu_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -113,7 +227,16 @@ declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
define i32 @test_x86_avx2_pmovmskb(<32 x i8> %a0) {
- ; CHECK: vpmovmskb
+; AVX2-LABEL: test_x86_avx2_pmovmskb:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmovmskb:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmovmskb %ymm0, %eax
+; AVX512VL-NEXT: retl
%res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -121,7 +244,15 @@ declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
define <16 x i16> @test_x86_avx2_pmulh_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpmulhw
+; AVX2-LABEL: test_x86_avx2_pmulh_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmulh_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -129,7 +260,15 @@ declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readn
define <16 x i16> @test_x86_avx2_pmulhu_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpmulhuw
+; AVX2-LABEL: test_x86_avx2_pmulhu_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmulhu_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -137,7 +276,15 @@ declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind read
define <4 x i64> @test_x86_avx2_pmulu_dq(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpmuludq
+; AVX2-LABEL: test_x86_avx2_pmulu_dq:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmulu_dq:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %a0, <8 x i32> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -145,7 +292,15 @@ declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnon
define <4 x i64> @test_x86_avx2_psad_bw(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpsadbw
+; AVX2-LABEL: test_x86_avx2_psad_bw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psad_bw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -153,7 +308,15 @@ declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
define <8 x i32> @test_x86_avx2_psll_d(<8 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpslld
+; AVX2-LABEL: test_x86_avx2_psll_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psll_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpslld %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -161,7 +324,15 @@ declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
define <4 x i64> @test_x86_avx2_psll_q(<4 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpsllq
+; AVX2-LABEL: test_x86_avx2_psll_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psll_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -169,7 +340,15 @@ declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
define <16 x i16> @test_x86_avx2_psll_w(<16 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpsllw
+; AVX2-LABEL: test_x86_avx2_psll_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psll_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -177,7 +356,15 @@ declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnon
define <8 x i32> @test_x86_avx2_pslli_d(<8 x i32> %a0) {
- ; CHECK: vpslld
+; AVX2-LABEL: test_x86_avx2_pslli_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpslld $7, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pslli_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpslld $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -185,7 +372,15 @@ declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone
define <4 x i64> @test_x86_avx2_pslli_q(<4 x i64> %a0) {
- ; CHECK: vpsllq
+; AVX2-LABEL: test_x86_avx2_pslli_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsllq $7, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pslli_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllq $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -193,7 +388,15 @@ declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
define <16 x i16> @test_x86_avx2_pslli_w(<16 x i16> %a0) {
- ; CHECK: vpsllw
+; AVX2-LABEL: test_x86_avx2_pslli_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pslli_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -201,7 +404,15 @@ declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone
define <8 x i32> @test_x86_avx2_psra_d(<8 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpsrad
+; AVX2-LABEL: test_x86_avx2_psra_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psra_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -209,7 +420,15 @@ declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_psra_w(<16 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpsraw
+; AVX2-LABEL: test_x86_avx2_psra_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psra_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -217,7 +436,15 @@ declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnon
define <8 x i32> @test_x86_avx2_psrai_d(<8 x i32> %a0) {
- ; CHECK: vpsrad
+; AVX2-LABEL: test_x86_avx2_psrai_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrad $7, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrai_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrad $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -225,7 +452,15 @@ declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone
define <16 x i16> @test_x86_avx2_psrai_w(<16 x i16> %a0) {
- ; CHECK: vpsraw
+; AVX2-LABEL: test_x86_avx2_psrai_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsraw $7, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrai_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsraw $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -233,7 +468,15 @@ declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
define <8 x i32> @test_x86_avx2_psrl_d(<8 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpsrld
+; AVX2-LABEL: test_x86_avx2_psrl_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrl_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrld %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -241,7 +484,15 @@ declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
define <4 x i64> @test_x86_avx2_psrl_q(<4 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpsrlq
+; AVX2-LABEL: test_x86_avx2_psrl_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrl_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -249,7 +500,15 @@ declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
define <16 x i16> @test_x86_avx2_psrl_w(<16 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpsrlw
+; AVX2-LABEL: test_x86_avx2_psrl_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrl_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -257,7 +516,15 @@ declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnon
define <8 x i32> @test_x86_avx2_psrli_d(<8 x i32> %a0) {
- ; CHECK: vpsrld
+; AVX2-LABEL: test_x86_avx2_psrli_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrld $7, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrli_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrld $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -265,7 +532,15 @@ declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone
define <4 x i64> @test_x86_avx2_psrli_q(<4 x i64> %a0) {
- ; CHECK: vpsrlq
+; AVX2-LABEL: test_x86_avx2_psrli_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrli_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlq $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -273,7 +548,15 @@ declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone
define <16 x i16> @test_x86_avx2_psrli_w(<16 x i16> %a0) {
- ; CHECK: vpsrlw
+; AVX2-LABEL: test_x86_avx2_psrli_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrli_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -281,7 +564,15 @@ declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpsubsb
+; AVX2-LABEL: test_x86_avx2_psubs_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsubsb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psubs_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsubsb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -289,7 +580,15 @@ declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpsubsw
+; AVX2-LABEL: test_x86_avx2_psubs_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsubsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psubs_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsubsw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -297,7 +596,15 @@ declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpsubusb
+; AVX2-LABEL: test_x86_avx2_psubus_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psubus_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -305,7 +612,15 @@ declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnon
define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpsubusw
+; AVX2-LABEL: test_x86_avx2_psubus_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psubus_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -313,7 +628,15 @@ declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind read
define <32 x i8> @test_x86_avx2_pabs_b(<32 x i8> %a0) {
- ; CHECK: vpabsb
+; AVX2-LABEL: test_x86_avx2_pabs_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpabsb %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pabs_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpabsb %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -321,7 +644,15 @@ declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone
define <8 x i32> @test_x86_avx2_pabs_d(<8 x i32> %a0) {
- ; CHECK: vpabsd
+; AVX2-LABEL: test_x86_avx2_pabs_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpabsd %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pabs_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpabsd %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -329,7 +660,15 @@ declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_pabs_w(<16 x i16> %a0) {
- ; CHECK: vpabsw
+; AVX2-LABEL: test_x86_avx2_pabs_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpabsw %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pabs_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpabsw %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -337,7 +676,15 @@ declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vphaddd
+; AVX2-LABEL: test_x86_avx2_phadd_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_phadd_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphaddd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -345,7 +692,15 @@ declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_phadd_sw(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vphaddsw
+; AVX2-LABEL: test_x86_avx2_phadd_sw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vphaddsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_phadd_sw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphaddsw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -353,7 +708,15 @@ declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind read
define <16 x i16> @test_x86_avx2_phadd_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vphaddw
+; AVX2-LABEL: test_x86_avx2_phadd_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_phadd_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphaddw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -361,7 +724,15 @@ declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readn
define <8 x i32> @test_x86_avx2_phsub_d(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vphsubd
+; AVX2-LABEL: test_x86_avx2_phsub_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vphsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_phsub_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphsubd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -369,7 +740,15 @@ declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_phsub_sw(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vphsubsw
+; AVX2-LABEL: test_x86_avx2_phsub_sw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vphsubsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_phsub_sw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphsubsw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -377,7 +756,15 @@ declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind read
define <16 x i16> @test_x86_avx2_phsub_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vphsubw
+; AVX2-LABEL: test_x86_avx2_phsub_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vphsubw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_phsub_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vphsubw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -385,7 +772,15 @@ declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readn
define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpmaddubsw
+; AVX2-LABEL: test_x86_avx2_pmadd_ub_sw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmadd_ub_sw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -393,7 +788,15 @@ declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind rea
define <16 x i16> @test_x86_avx2_pmul_hr_sw(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpmulhrsw
+; AVX2-LABEL: test_x86_avx2_pmul_hr_sw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmul_hr_sw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -401,7 +804,15 @@ declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind re
define <32 x i8> @test_x86_avx2_pshuf_b(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpshufb
+; AVX2-LABEL: test_x86_avx2_pshuf_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pshuf_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
 %res = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -409,7 +820,15 @@ declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
define <32 x i8> @test_x86_avx2_psign_b(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpsignb
+; AVX2-LABEL: test_x86_avx2_psign_b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsignb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psign_b:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsignb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -417,7 +836,15 @@ declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
define <8 x i32> @test_x86_avx2_psign_d(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpsignd
+; AVX2-LABEL: test_x86_avx2_psign_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsignd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psign_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsignd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
 %res = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -425,7 +852,15 @@ declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_psign_w(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpsignw
+; AVX2-LABEL: test_x86_avx2_psign_w:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsignw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psign_w:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsignw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -433,8 +868,17 @@ declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readn
define <4 x i64> @test_x86_avx2_movntdqa(i8* %a0) {
- ; CHECK: movl
- ; CHECK: vmovntdqa
+; AVX2-LABEL: test_x86_avx2_movntdqa:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vmovntdqa (%eax), %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_movntdqa:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vmovntdqa (%eax), %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %a0) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -442,7 +886,15 @@ declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vmpsadbw
+; AVX2-LABEL: test_x86_avx2_mpsadbw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_mpsadbw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -450,7 +902,15 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind rea
define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpackusdw
+; AVX2-LABEL: test_x86_avx2_packusdw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_packusdw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -458,7 +918,15 @@ declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readno
define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) {
- ; CHECK: vpblendvb
+; AVX2-LABEL: test_x86_avx2_pblendvb:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pblendvb:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -466,7 +934,15 @@ declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounw
define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpblendw
+; AVX2-LABEL: test_x86_avx2_pblendw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pblendw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i8 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -474,7 +950,15 @@ declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind r
define <32 x i8> @test_x86_avx2_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpmaxsb
+; AVX2-LABEL: test_x86_avx2_pmaxsb:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxsb:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -482,7 +966,15 @@ declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
define <8 x i32> @test_x86_avx2_pmaxsd(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpmaxsd
+; AVX2-LABEL: test_x86_avx2_pmaxsd:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxsd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -490,7 +982,15 @@ declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
define <8 x i32> @test_x86_avx2_pmaxud(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpmaxud
+; AVX2-LABEL: test_x86_avx2_pmaxud:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxud:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -498,7 +998,15 @@ declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_pmaxuw(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpmaxuw
+; AVX2-LABEL: test_x86_avx2_pmaxuw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxuw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -506,7 +1014,15 @@ declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_pminsb(<32 x i8> %a0, <32 x i8> %a1) {
- ; CHECK: vpminsb
+; AVX2-LABEL: test_x86_avx2_pminsb:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pminsb:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -514,7 +1030,15 @@ declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
define <8 x i32> @test_x86_avx2_pminsd(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpminsd
+; AVX2-LABEL: test_x86_avx2_pminsd:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pminsd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -522,7 +1046,15 @@ declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
define <8 x i32> @test_x86_avx2_pminud(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpminud
+; AVX2-LABEL: test_x86_avx2_pminud:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pminud:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -530,111 +1062,22 @@ declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_pminuw(<16 x i16> %a0, <16 x i16> %a1) {
- ; CHECK: vpminuw
+; AVX2-LABEL: test_x86_avx2_pminuw:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pminuw:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone
-define <8 x i32> @test_x86_avx2_pmovsxbd(<16 x i8> %a0) {
- ; CHECK: vpmovsxbd
- %res = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1]
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_pmovsxbq(<16 x i8> %a0) {
- ; CHECK: vpmovsxbq
- %res = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone
-
-
-define <16 x i16> @test_x86_avx2_pmovsxbw(<16 x i8> %a0) {
- ; CHECK: vpmovsxbw
- %res = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
- ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_pmovsxdq(<4 x i32> %a0) {
- ; CHECK: vpmovsxdq
- %res = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx2_pmovsxwd(<8 x i16> %a0) {
- ; CHECK: vpmovsxwd
- %res = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1]
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_pmovsxwq(<8 x i16> %a0) {
- ; CHECK: vpmovsxwq
- %res = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx2_pmovzxbd(<16 x i8> %a0) {
- ; CHECK: vpmovzxbd
- %res = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1]
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_pmovzxbq(<16 x i8> %a0) {
- ; CHECK: vpmovzxbq
- %res = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone
-
-
-define <16 x i16> @test_x86_avx2_pmovzxbw(<16 x i8> %a0) {
- ; CHECK: vpmovzxbw
- %res = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %a0) ; <<16 x i16>> [#uses=1]
- ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_pmovzxdq(<4 x i32> %a0) {
- ; CHECK: vpmovzxdq
- %res = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx2_pmovzxwd(<8 x i16> %a0) {
- ; CHECK: vpmovzxwd
- %res = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1]
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_pmovzxwq(<8 x i16> %a0) {
- ; CHECK: vpmovzxwq
- %res = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone
-
-
define <4 x i64> @test_x86_avx2_pmul.dq(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpmuldq
%res = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a0, <8 x i32> %a1) ; <<2 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -642,7 +1085,15 @@ declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpblendd
+; AVX2-LABEL: test_x86_avx2_pblendd_128:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pblendd_128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i8 7) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -650,29 +1101,53 @@ declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind
define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpblendd
+; AVX2-LABEL: test_x86_avx2_pblendd_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_pblendd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
+; Check that the arguments are swapped between the intrinsic definition
+; and its lowering. Indeed, the offsets are the first source in
+; the instruction.
define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) {
- ; Check that the arguments are swapped between the intrinsic definition
- ; and its lowering. Indeed, the offsets are the first source in
- ; the instruction.
- ; CHECK: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-LABEL: test_x86_avx2_permd:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_permd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
+; Check that the arguments are swapped between the intrinsic definition
+; and its lowering. Indeed, the offsets are the first source in
+; the instruction.
define <8 x float> @test_x86_avx2_permps(<8 x float> %a0, <8 x i32> %a1) {
- ; Check that the arguments are swapped between the intrinsic definition
- ; and its lowering. Indeed, the offsets are the first source in
- ; the instruction.
- ; CHECK: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-LABEL: test_x86_avx2_permps:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_permps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -680,7 +1155,15 @@ declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind reado
define <4 x i64> @test_x86_avx2_vperm2i128(<4 x i64> %a0, <4 x i64> %a1) {
- ; CHECK: vperm2i128
+; AVX2-LABEL: test_x86_avx2_vperm2i128:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_vperm2i128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -688,7 +1171,17 @@ declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind r
define <2 x i64> @test_x86_avx2_maskload_q(i8* %a0, <2 x i64> %a1) {
- ; CHECK: vpmaskmovq
+; AVX2-LABEL: test_x86_avx2_maskload_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpmaskmovq (%eax), %xmm0, %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_maskload_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpmaskmovq (%eax), %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -696,7 +1189,17 @@ declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly
define <4 x i64> @test_x86_avx2_maskload_q_256(i8* %a0, <4 x i64> %a1) {
- ; CHECK: vpmaskmovq
+; AVX2-LABEL: test_x86_avx2_maskload_q_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpmaskmovq (%eax), %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_maskload_q_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpmaskmovq (%eax), %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -704,7 +1207,17 @@ declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonl
define <4 x i32> @test_x86_avx2_maskload_d(i8* %a0, <4 x i32> %a1) {
- ; CHECK: vpmaskmovd
+; AVX2-LABEL: test_x86_avx2_maskload_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_maskload_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -712,7 +1225,17 @@ declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly
define <8 x i32> @test_x86_avx2_maskload_d_256(i8* %a0, <8 x i32> %a1) {
- ; CHECK: vpmaskmovd
+; AVX2-LABEL: test_x86_avx2_maskload_d_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpmaskmovd (%eax), %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_maskload_d_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpmaskmovd (%eax), %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -720,7 +1243,17 @@ declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonl
define void @test_x86_avx2_maskstore_q(i8* %a0, <2 x i64> %a1, <2 x i64> %a2) {
- ; CHECK: vpmaskmovq
+; AVX2-LABEL: test_x86_avx2_maskstore_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax)
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_maskstore_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx2.maskstore.q(i8* %a0, <2 x i64> %a1, <2 x i64> %a2)
ret void
}
@@ -728,7 +1261,18 @@ declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind
define void @test_x86_avx2_maskstore_q_256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2) {
- ; CHECK: vpmaskmovq
+; AVX2-LABEL: test_x86_avx2_maskstore_q_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_maskstore_q_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx2.maskstore.q.256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2)
ret void
}
@@ -736,7 +1280,17 @@ declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind
define void @test_x86_avx2_maskstore_d(i8* %a0, <4 x i32> %a1, <4 x i32> %a2) {
- ; CHECK: vpmaskmovd
+; AVX2-LABEL: test_x86_avx2_maskstore_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax)
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_maskstore_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx2.maskstore.d(i8* %a0, <4 x i32> %a1, <4 x i32> %a2)
ret void
}
@@ -744,7 +1298,18 @@ declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind
define void @test_x86_avx2_maskstore_d_256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2) {
- ; CHECK: vpmaskmovd
+; AVX2-LABEL: test_x86_avx2_maskstore_d_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_maskstore_d_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax)
+; AVX512VL-NEXT: retl
call void @llvm.x86.avx2.maskstore.d.256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2)
ret void
}
@@ -752,7 +1317,15 @@ declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind
define <4 x i32> @test_x86_avx2_psllv_d(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpsllvd
+; AVX2-LABEL: test_x86_avx2_psllv_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psllv_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -760,7 +1333,15 @@ declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i32> @test_x86_avx2_psllv_d_256(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpsllvd
+; AVX2-LABEL: test_x86_avx2_psllv_d_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psllv_d_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -768,7 +1349,15 @@ declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind read
define <2 x i64> @test_x86_avx2_psllv_q(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpsllvq
+; AVX2-LABEL: test_x86_avx2_psllv_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psllv_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -776,7 +1365,15 @@ declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
define <4 x i64> @test_x86_avx2_psllv_q_256(<4 x i64> %a0, <4 x i64> %a1) {
- ; CHECK: vpsllvq
+; AVX2-LABEL: test_x86_avx2_psllv_q_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psllv_q_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -784,7 +1381,15 @@ declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind read
define <4 x i32> @test_x86_avx2_psrlv_d(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpsrlvd
+; AVX2-LABEL: test_x86_avx2_psrlv_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrlv_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -792,7 +1397,15 @@ declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i32> @test_x86_avx2_psrlv_d_256(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpsrlvd
+; AVX2-LABEL: test_x86_avx2_psrlv_d_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrlv_d_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -800,7 +1413,15 @@ declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind read
define <2 x i64> @test_x86_avx2_psrlv_q(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpsrlvq
+; AVX2-LABEL: test_x86_avx2_psrlv_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrlv_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -808,7 +1429,15 @@ declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
define <4 x i64> @test_x86_avx2_psrlv_q_256(<4 x i64> %a0, <4 x i64> %a1) {
- ; CHECK: vpsrlvq
+; AVX2-LABEL: test_x86_avx2_psrlv_q_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrlv_q_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -816,33 +1445,79 @@ declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind read
define <4 x i32> @test_x86_avx2_psrav_d(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpsravd
+; AVX2-LABEL: test_x86_avx2_psrav_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrav_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
-declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
+define <4 x i32> @test_x86_avx2_psrav_d_const(<4 x i32> %a0, <4 x i32> %a1) {
+; AVX2-LABEL: test_x86_avx2_psrav_d_const:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
+; AVX2-NEXT: vpsravd LCPI90_1, %xmm0, %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmovdqa32 {{.*#+}} xmm0 = [2,9,4294967284,23]
+; AVX512VL-NEXT: vpsravd LCPI90_1, %xmm0, %xmm0
+; AVX512VL-NEXT: retl
+ %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> <i32 2, i32 9, i32 -12, i32 23>, <4 x i32> <i32 1, i32 18, i32 35, i32 52>)
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i32> @test_x86_avx2_psrav_d_256(<8 x i32> %a0, <8 x i32> %a1) {
- ; CHECK: vpsravd
+; AVX2-LABEL: test_x86_avx2_psrav_d_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrav_d_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
-declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
-; This is checked here because the execution dependency fix pass makes it hard to test in AVX mode since we don't have 256-bit integer instructions
-define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
- ; CHECK: vmovdqu
- ; add operation forces the execution domain.
- %a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
- call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
- ret void
+define <8 x i32> @test_x86_avx2_psrav_d_256_const(<8 x i32> %a0, <8 x i32> %a1) {
+; AVX2-LABEL: test_x86_avx2_psrav_d_256_const:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; AVX2-NEXT: vpsravd LCPI92_1, %ymm0, %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vmovdqa32 {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; AVX512VL-NEXT: vpsravd LCPI92_1, %ymm0, %ymm0
+; AVX512VL-NEXT: retl
+ %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>)
+ ret <8 x i32> %res
}
-declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
+declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
-define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1,
- <4 x i32> %idx, <2 x double> %mask) {
- ; CHECK: vgatherdpd
+define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1, <4 x i32> %idx, <2 x double> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_d_pd:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_d_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0,
i8* %a1, <4 x i32> %idx, <2 x double> %mask, i8 2) ;
ret <2 x double> %res
@@ -850,9 +1525,18 @@ define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1,
declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*,
<4 x i32>, <2 x double>, i8) nounwind readonly
-define <4 x double> @test_x86_avx2_gather_d_pd_256(<4 x double> %a0, i8* %a1,
- <4 x i32> %idx, <4 x double> %mask) {
- ; CHECK: vgatherdpd
+define <4 x double> @test_x86_avx2_gather_d_pd_256(<4 x double> %a0, i8* %a1, <4 x i32> %idx, <4 x double> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_d_pd_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_d_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0,
i8* %a1, <4 x i32> %idx, <4 x double> %mask, i8 2) ;
ret <4 x double> %res
@@ -860,9 +1544,18 @@ define <4 x double> @test_x86_avx2_gather_d_pd_256(<4 x double> %a0, i8* %a1,
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*,
<4 x i32>, <4 x double>, i8) nounwind readonly
-define <2 x double> @test_x86_avx2_gather_q_pd(<2 x double> %a0, i8* %a1,
- <2 x i64> %idx, <2 x double> %mask) {
- ; CHECK: vgatherqpd
+define <2 x double> @test_x86_avx2_gather_q_pd(<2 x double> %a0, i8* %a1, <2 x i64> %idx, <2 x double> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_q_pd:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_q_pd:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0,
i8* %a1, <2 x i64> %idx, <2 x double> %mask, i8 2) ;
ret <2 x double> %res
@@ -870,9 +1563,18 @@ define <2 x double> @test_x86_avx2_gather_q_pd(<2 x double> %a0, i8* %a1,
declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*,
<2 x i64>, <2 x double>, i8) nounwind readonly
-define <4 x double> @test_x86_avx2_gather_q_pd_256(<4 x double> %a0, i8* %a1,
- <4 x i64> %idx, <4 x double> %mask) {
- ; CHECK: vgatherqpd
+define <4 x double> @test_x86_avx2_gather_q_pd_256(<4 x double> %a0, i8* %a1, <4 x i64> %idx, <4 x double> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_q_pd_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_q_pd_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0,
i8* %a1, <4 x i64> %idx, <4 x double> %mask, i8 2) ;
ret <4 x double> %res
@@ -880,9 +1582,18 @@ define <4 x double> @test_x86_avx2_gather_q_pd_256(<4 x double> %a0, i8* %a1,
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*,
<4 x i64>, <4 x double>, i8) nounwind readonly
-define <4 x float> @test_x86_avx2_gather_d_ps(<4 x float> %a0, i8* %a1,
- <4 x i32> %idx, <4 x float> %mask) {
- ; CHECK: vgatherdps
+define <4 x float> @test_x86_avx2_gather_d_ps(<4 x float> %a0, i8* %a1, <4 x i32> %idx, <4 x float> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_d_ps:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_d_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0,
i8* %a1, <4 x i32> %idx, <4 x float> %mask, i8 2) ;
ret <4 x float> %res
@@ -890,9 +1601,18 @@ define <4 x float> @test_x86_avx2_gather_d_ps(<4 x float> %a0, i8* %a1,
declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*,
<4 x i32>, <4 x float>, i8) nounwind readonly
-define <8 x float> @test_x86_avx2_gather_d_ps_256(<8 x float> %a0, i8* %a1,
- <8 x i32> %idx, <8 x float> %mask) {
- ; CHECK: vgatherdps
+define <8 x float> @test_x86_avx2_gather_d_ps_256(<8 x float> %a0, i8* %a1, <8 x i32> %idx, <8 x float> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_d_ps_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_d_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0,
i8* %a1, <8 x i32> %idx, <8 x float> %mask, i8 2) ;
ret <8 x float> %res
@@ -900,9 +1620,18 @@ define <8 x float> @test_x86_avx2_gather_d_ps_256(<8 x float> %a0, i8* %a1,
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*,
<8 x i32>, <8 x float>, i8) nounwind readonly
-define <4 x float> @test_x86_avx2_gather_q_ps(<4 x float> %a0, i8* %a1,
- <2 x i64> %idx, <4 x float> %mask) {
- ; CHECK: vgatherqps
+define <4 x float> @test_x86_avx2_gather_q_ps(<4 x float> %a0, i8* %a1, <2 x i64> %idx, <4 x float> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_q_ps:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_q_ps:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0,
i8* %a1, <2 x i64> %idx, <4 x float> %mask, i8 2) ;
ret <4 x float> %res
@@ -910,9 +1639,19 @@ define <4 x float> @test_x86_avx2_gather_q_ps(<4 x float> %a0, i8* %a1,
declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*,
<2 x i64>, <4 x float>, i8) nounwind readonly
-define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, i8* %a1,
- <4 x i64> %idx, <4 x float> %mask) {
- ; CHECK: vgatherqps
+define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, i8* %a1, <4 x i64> %idx, <4 x float> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_q_ps_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_q_ps_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0,
i8* %a1, <4 x i64> %idx, <4 x float> %mask, i8 2) ;
ret <4 x float> %res
@@ -920,9 +1659,18 @@ define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, i8* %a1,
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*,
<4 x i64>, <4 x float>, i8) nounwind readonly
-define <2 x i64> @test_x86_avx2_gather_d_q(<2 x i64> %a0, i8* %a1,
- <4 x i32> %idx, <2 x i64> %mask) {
- ; CHECK: vpgatherdq
+define <2 x i64> @test_x86_avx2_gather_d_q(<2 x i64> %a0, i8* %a1, <4 x i32> %idx, <2 x i64> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_d_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_d_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0,
i8* %a1, <4 x i32> %idx, <2 x i64> %mask, i8 2) ;
ret <2 x i64> %res
@@ -930,9 +1678,18 @@ define <2 x i64> @test_x86_avx2_gather_d_q(<2 x i64> %a0, i8* %a1,
declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*,
<4 x i32>, <2 x i64>, i8) nounwind readonly
-define <4 x i64> @test_x86_avx2_gather_d_q_256(<4 x i64> %a0, i8* %a1,
- <4 x i32> %idx, <4 x i64> %mask) {
- ; CHECK: vpgatherdq
+define <4 x i64> @test_x86_avx2_gather_d_q_256(<4 x i64> %a0, i8* %a1, <4 x i32> %idx, <4 x i64> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_d_q_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_d_q_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0,
i8* %a1, <4 x i32> %idx, <4 x i64> %mask, i8 2) ;
ret <4 x i64> %res
@@ -940,9 +1697,18 @@ define <4 x i64> @test_x86_avx2_gather_d_q_256(<4 x i64> %a0, i8* %a1,
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*,
<4 x i32>, <4 x i64>, i8) nounwind readonly
-define <2 x i64> @test_x86_avx2_gather_q_q(<2 x i64> %a0, i8* %a1,
- <2 x i64> %idx, <2 x i64> %mask) {
- ; CHECK: vpgatherqq
+define <2 x i64> @test_x86_avx2_gather_q_q(<2 x i64> %a0, i8* %a1, <2 x i64> %idx, <2 x i64> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_q_q:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_q_q:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0,
i8* %a1, <2 x i64> %idx, <2 x i64> %mask, i8 2) ;
ret <2 x i64> %res
@@ -950,9 +1716,18 @@ define <2 x i64> @test_x86_avx2_gather_q_q(<2 x i64> %a0, i8* %a1,
declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*,
<2 x i64>, <2 x i64>, i8) nounwind readonly
-define <4 x i64> @test_x86_avx2_gather_q_q_256(<4 x i64> %a0, i8* %a1,
- <4 x i64> %idx, <4 x i64> %mask) {
- ; CHECK: vpgatherqq
+define <4 x i64> @test_x86_avx2_gather_q_q_256(<4 x i64> %a0, i8* %a1, <4 x i64> %idx, <4 x i64> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_q_q_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_q_q_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
+; AVX512VL-NEXT: retl
%res = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0,
i8* %a1, <4 x i64> %idx, <4 x i64> %mask, i8 2) ;
ret <4 x i64> %res
@@ -960,9 +1735,18 @@ define <4 x i64> @test_x86_avx2_gather_q_q_256(<4 x i64> %a0, i8* %a1,
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*,
<4 x i64>, <4 x i64>, i8) nounwind readonly
-define <4 x i32> @test_x86_avx2_gather_d_d(<4 x i32> %a0, i8* %a1,
- <4 x i32> %idx, <4 x i32> %mask) {
- ; CHECK: vpgatherdd
+define <4 x i32> @test_x86_avx2_gather_d_d(<4 x i32> %a0, i8* %a1, <4 x i32> %idx, <4 x i32> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_d_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_d_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %a0,
i8* %a1, <4 x i32> %idx, <4 x i32> %mask, i8 2) ;
ret <4 x i32> %res
@@ -970,9 +1754,18 @@ define <4 x i32> @test_x86_avx2_gather_d_d(<4 x i32> %a0, i8* %a1,
declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*,
<4 x i32>, <4 x i32>, i8) nounwind readonly
-define <8 x i32> @test_x86_avx2_gather_d_d_256(<8 x i32> %a0, i8* %a1,
- <8 x i32> %idx, <8 x i32> %mask) {
- ; CHECK: vpgatherdd
+define <8 x i32> @test_x86_avx2_gather_d_d_256(<8 x i32> %a0, i8* %a1, <8 x i32> %idx, <8 x i32> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_d_d_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_d_d_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
+; AVX512VL-NEXT: retl
%res = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %a0,
i8* %a1, <8 x i32> %idx, <8 x i32> %mask, i8 2) ;
ret <8 x i32> %res
@@ -980,9 +1773,18 @@ define <8 x i32> @test_x86_avx2_gather_d_d_256(<8 x i32> %a0, i8* %a1,
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*,
<8 x i32>, <8 x i32>, i8) nounwind readonly
-define <4 x i32> @test_x86_avx2_gather_q_d(<4 x i32> %a0, i8* %a1,
- <2 x i64> %idx, <4 x i32> %mask) {
- ; CHECK: vpgatherqd
+define <4 x i32> @test_x86_avx2_gather_q_d(<4 x i32> %a0, i8* %a1, <2 x i64> %idx, <4 x i32> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_q_d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_q_d:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %a0,
i8* %a1, <2 x i64> %idx, <4 x i32> %mask, i8 2) ;
ret <4 x i32> %res
@@ -990,9 +1792,19 @@ define <4 x i32> @test_x86_avx2_gather_q_d(<4 x i32> %a0, i8* %a1,
declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*,
<2 x i64>, <4 x i32>, i8) nounwind readonly
-define <4 x i32> @test_x86_avx2_gather_q_d_256(<4 x i32> %a0, i8* %a1,
- <4 x i64> %idx, <4 x i32> %mask) {
- ; CHECK: vpgatherqd
+define <4 x i32> @test_x86_avx2_gather_q_d_256(<4 x i32> %a0, i8* %a1, <4 x i64> %idx, <4 x i32> %mask) {
+; AVX2-LABEL: test_x86_avx2_gather_q_d_256:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx2_gather_q_d_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
+; AVX512VL-NEXT: retl
%res = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %a0,
i8* %a1, <4 x i64> %idx, <4 x i32> %mask, i8 2) ;
ret <4 x i32> %res
@@ -1001,13 +1813,25 @@ declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*,
<4 x i64>, <4 x i32>, i8) nounwind readonly
; PR13298
-define <8 x float> @test_gather_mask(<8 x float> %a0, float* %a,
- <8 x i32> %idx, <8 x float> %mask,
- float* nocapture %out) {
-; CHECK: test_gather_mask
-; CHECK: vmovaps %ymm2, [[DEST:%.*]]
-; CHECK: vgatherdps [[DEST]]
+define <8 x float> @test_gather_mask(<8 x float> %a0, float* %a, <8 x i32> %idx, <8 x float> %mask, float* nocapture %out) {
;; gather with mask
+; AVX2-LABEL: test_gather_mask:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX2-NEXT: vmovaps %ymm2, %ymm3
+; AVX2-NEXT: vgatherdps %ymm3, (%ecx,%ymm1,4), %ymm0
+; AVX2-NEXT: vmovups %ymm2, (%eax)
+; AVX2-NEXT: retl
+;
+; AVX512VL-LABEL: test_gather_mask:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512VL-NEXT: vmovaps %ymm2, %ymm3
+; AVX512VL-NEXT: vgatherdps %ymm3, (%ecx,%ymm1,4), %ymm0
+; AVX512VL-NEXT: vmovups %ymm2, (%eax)
+; AVX512VL-NEXT: retl
%a_i8 = bitcast float* %a to i8*
%res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0,
i8* %a_i8, <8 x i32> %idx, <8 x float> %mask, i8 4) ;
diff --git a/test/CodeGen/X86/avx2-logic.ll b/test/CodeGen/X86/avx2-logic.ll
index 3d4fcec6078e..e187933f66be 100644
--- a/test/CodeGen/X86/avx2-logic.ll
+++ b/test/CodeGen/X86/avx2-logic.ll
@@ -53,35 +53,6 @@ define <32 x i8> @vpblendvb(<32 x i1> %cond, <32 x i8> %x, <32 x i8> %y) {
ret <32 x i8> %min
}
-define <8 x i32> @signd(<8 x i32> %a, <8 x i32> %b) nounwind {
-entry:
-; CHECK-LABEL: signd:
-; CHECK: psignd
-; CHECK-NOT: sub
-; CHECK: ret
- %b.lobit = ashr <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
- %sub = sub nsw <8 x i32> zeroinitializer, %a
- %0 = xor <8 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
- %1 = and <8 x i32> %a, %0
- %2 = and <8 x i32> %b.lobit, %sub
- %cond = or <8 x i32> %1, %2
- ret <8 x i32> %cond
-}
-
-define <8 x i32> @blendvb(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) nounwind {
-entry:
-; CHECK-LABEL: blendvb:
-; CHECK: pblendvb
-; CHECK: ret
- %b.lobit = ashr <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
- %sub = sub nsw <8 x i32> zeroinitializer, %a
- %0 = xor <8 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
- %1 = and <8 x i32> %c, %0
- %2 = and <8 x i32> %a, %b.lobit
- %cond = or <8 x i32> %1, %2
- ret <8 x i32> %cond
-}
-
define <8 x i32> @allOnes() nounwind {
; CHECK: vpcmpeqd
; CHECK-NOT: vinsert
diff --git a/test/CodeGen/X86/avx2-nontemporal.ll b/test/CodeGen/X86/avx2-nontemporal.ll
index 058358f13b86..55c966f6f884 100644
--- a/test/CodeGen/X86/avx2-nontemporal.ll
+++ b/test/CodeGen/X86/avx2-nontemporal.ll
@@ -1,18 +1,69 @@
-; RUN: llc < %s -march=x86 -mattr=+avx2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=X64
-define void @f(<8 x float> %A, i8* %B, <4 x double> %C, <4 x i64> %E) {
-; CHECK: vmovntps %y
+define void @f(<8 x float> %A, i8* %B, <4 x double> %C, <4 x i64> %E, <8 x i32> %F, <16 x i16> %G, <32 x i8> %H) nounwind {
+; X32-LABEL: f:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-32, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: vmovdqa 104(%ebp), %ymm3
+; X32-NEXT: vmovdqa 72(%ebp), %ymm4
+; X32-NEXT: vmovdqa 40(%ebp), %ymm5
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: vaddps .LCPI0_0, %ymm0, %ymm0
+; X32-NEXT: vmovntps %ymm0, (%eax)
+; X32-NEXT: vpaddq .LCPI0_1, %ymm2, %ymm0
+; X32-NEXT: vmovntdq %ymm0, (%eax)
+; X32-NEXT: vaddpd .LCPI0_2, %ymm1, %ymm0
+; X32-NEXT: vmovntpd %ymm0, (%eax)
+; X32-NEXT: vpaddd .LCPI0_3, %ymm5, %ymm0
+; X32-NEXT: vmovntdq %ymm0, (%eax)
+; X32-NEXT: vpaddw .LCPI0_4, %ymm4, %ymm0
+; X32-NEXT: vmovntdq %ymm0, (%eax)
+; X32-NEXT: vpaddb .LCPI0_5, %ymm3, %ymm0
+; X32-NEXT: vmovntdq %ymm0, (%eax)
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: f:
+; X64: # BB#0:
+; X64-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vmovntps %ymm0, (%rdi)
+; X64-NEXT: vpaddq {{.*}}(%rip), %ymm2, %ymm0
+; X64-NEXT: vmovntdq %ymm0, (%rdi)
+; X64-NEXT: vaddpd {{.*}}(%rip), %ymm1, %ymm0
+; X64-NEXT: vmovntpd %ymm0, (%rdi)
+; X64-NEXT: vpaddd {{.*}}(%rip), %ymm3, %ymm0
+; X64-NEXT: vmovntdq %ymm0, (%rdi)
+; X64-NEXT: vpaddw {{.*}}(%rip), %ymm4, %ymm0
+; X64-NEXT: vmovntdq %ymm0, (%rdi)
+; X64-NEXT: vpaddb {{.*}}(%rip), %ymm5, %ymm0
+; X64-NEXT: vmovntdq %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%cast = bitcast i8* %B to <8 x float>*
- %A2 = fadd <8 x float> %A, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x4200000000000000>
+ %A2 = fadd <8 x float> %A, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>
store <8 x float> %A2, <8 x float>* %cast, align 32, !nontemporal !0
-; CHECK: vmovntdq %y
%cast1 = bitcast i8* %B to <4 x i64>*
%E2 = add <4 x i64> %E, <i64 1, i64 2, i64 3, i64 4>
store <4 x i64> %E2, <4 x i64>* %cast1, align 32, !nontemporal !0
-; CHECK: vmovntpd %y
%cast2 = bitcast i8* %B to <4 x double>*
- %C2 = fadd <4 x double> %C, <double 0x0, double 0x0, double 0x0, double 0x4200000000000000>
+ %C2 = fadd <4 x double> %C, <double 1.0, double 2.0, double 3.0, double 4.0>
store <4 x double> %C2, <4 x double>* %cast2, align 32, !nontemporal !0
+ %cast3 = bitcast i8* %B to <8 x i32>*
+ %F2 = add <8 x i32> %F, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+ store <8 x i32> %F2, <8 x i32>* %cast3, align 32, !nontemporal !0
+ %cast4 = bitcast i8* %B to <16 x i16>*
+ %G2 = add <16 x i16> %G, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
+ store <16 x i16> %G2, <16 x i16>* %cast4, align 32, !nontemporal !0
+ %cast5 = bitcast i8* %B to <32 x i8>*
+ %H2 = add <32 x i8> %H, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
+ store <32 x i8> %H2, <32 x i8>* %cast5, align 32, !nontemporal !0
ret void
}
diff --git a/test/CodeGen/X86/avx2-phaddsub.ll b/test/CodeGen/X86/avx2-phaddsub.ll
index 3f9c95cfd070..88c70ad84fa0 100644
--- a/test/CodeGen/X86/avx2-phaddsub.ll
+++ b/test/CodeGen/X86/avx2-phaddsub.ll
@@ -1,71 +1,88 @@
-; RUN: llc < %s -march=x86-64 -mattr=+avx2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s
-; CHECK-LABEL: phaddw1:
-; CHECK: vphaddw
define <16 x i16> @phaddw1(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: phaddw1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%a = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
%b = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
%r = add <16 x i16> %a, %b
ret <16 x i16> %r
}
-; CHECK-LABEL: phaddw2:
-; CHECK: vphaddw
define <16 x i16> @phaddw2(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: phaddw2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%a = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
%b = shufflevector <16 x i16> %y, <16 x i16> %x, <16 x i32> <i32 16, i32 18, i32 20, i32 22, i32 0, i32 2, i32 4, i32 6, i32 24, i32 26, i32 28, i32 30, i32 8, i32 10, i32 12, i32 14>
%r = add <16 x i16> %a, %b
ret <16 x i16> %r
}
-; CHECK-LABEL: phaddd1:
-; CHECK: vphaddd
define <8 x i32> @phaddd1(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: phaddd1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
%b = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
%r = add <8 x i32> %a, %b
ret <8 x i32> %r
}
-; CHECK-LABEL: phaddd2:
-; CHECK: vphaddd
define <8 x i32> @phaddd2(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: phaddd2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
%b = shufflevector <8 x i32> %y, <8 x i32> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
%r = add <8 x i32> %a, %b
ret <8 x i32> %r
}
-; CHECK-LABEL: phaddd3:
-; CHECK: vphaddd
define <8 x i32> @phaddd3(<8 x i32> %x) {
+; CHECK-LABEL: phaddd3:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
%a = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
%b = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
%r = add <8 x i32> %a, %b
ret <8 x i32> %r
}
-; CHECK-LABEL: phsubw1:
-; CHECK: vphsubw
define <16 x i16> @phsubw1(<16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: phsubw1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphsubw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%a = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
%b = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
%r = sub <16 x i16> %a, %b
ret <16 x i16> %r
}
-; CHECK-LABEL: phsubd1:
-; CHECK: vphsubd
define <8 x i32> @phsubd1(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: phsubd1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphsubd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
%b = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
%r = sub <8 x i32> %a, %b
ret <8 x i32> %r
}
-; CHECK-LABEL: phsubd2:
-; CHECK: vphsubd
define <8 x i32> @phsubd2(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: phsubd2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphsubd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 undef, i32 8, i32 undef, i32 4, i32 6, i32 12, i32 14>
%b = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 1, i32 undef, i32 9, i32 11, i32 5, i32 7, i32 undef, i32 15>
%r = sub <8 x i32> %a, %b
diff --git a/test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll b/test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll
deleted file mode 100644
index 6bd6a5041d41..000000000000
--- a/test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll
+++ /dev/null
@@ -1,110 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 | FileCheck %s
-
-define <16 x i16> @test_lvm_x86_avx2_pmovsxbw(<16 x i8>* %a) {
-; CHECK-LABEL: test_lvm_x86_avx2_pmovsxbw
-; CHECK: vpmovsxbw (%rdi), %ymm0
- %1 = load <16 x i8>, <16 x i8>* %a, align 1
- %2 = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %1)
- ret <16 x i16> %2
-}
-
-define <8 x i32> @test_llvm_x86_avx2_pmovsxbd(<16 x i8>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovsxbd
-; CHECK: vpmovsxbd (%rdi), %ymm0
- %1 = load <16 x i8>, <16 x i8>* %a, align 1
- %2 = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %1)
- ret <8 x i32> %2
-}
-
-define <4 x i64> @test_llvm_x86_avx2_pmovsxbq(<16 x i8>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovsxbq
-; CHECK: vpmovsxbq (%rdi), %ymm0
- %1 = load <16 x i8>, <16 x i8>* %a, align 1
- %2 = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %1)
- ret <4 x i64> %2
-}
-
-define <8 x i32> @test_llvm_x86_avx2_pmovsxwd(<8 x i16>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovsxwd
-; CHECK: vpmovsxwd (%rdi), %ymm0
- %1 = load <8 x i16>, <8 x i16>* %a, align 1
- %2 = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %1)
- ret <8 x i32> %2
-}
-
-define <4 x i64> @test_llvm_x86_avx2_pmovsxwq(<8 x i16>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovsxwq
-; CHECK: vpmovsxwq (%rdi), %ymm0
- %1 = load <8 x i16>, <8 x i16>* %a, align 1
- %2 = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %1)
- ret <4 x i64> %2
-}
-
-define <4 x i64> @test_llvm_x86_avx2_pmovsxdq(<4 x i32>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovsxdq
-; CHECK: vpmovsxdq (%rdi), %ymm0
- %1 = load <4 x i32>, <4 x i32>* %a, align 1
- %2 = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %1)
- ret <4 x i64> %2
-}
-
-define <16 x i16> @test_lvm_x86_avx2_pmovzxbw(<16 x i8>* %a) {
-; CHECK-LABEL: test_lvm_x86_avx2_pmovzxbw
-; CHECK: vpmovzxbw (%rdi), %ymm0
- %1 = load <16 x i8>, <16 x i8>* %a, align 1
- %2 = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %1)
- ret <16 x i16> %2
-}
-
-define <8 x i32> @test_llvm_x86_avx2_pmovzxbd(<16 x i8>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovzxbd
-; CHECK: vpmovzxbd (%rdi), %ymm0
- %1 = load <16 x i8>, <16 x i8>* %a, align 1
- %2 = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %1)
- ret <8 x i32> %2
-}
-
-define <4 x i64> @test_llvm_x86_avx2_pmovzxbq(<16 x i8>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovzxbq
-; CHECK: vpmovzxbq (%rdi), %ymm0
- %1 = load <16 x i8>, <16 x i8>* %a, align 1
- %2 = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %1)
- ret <4 x i64> %2
-}
-
-define <8 x i32> @test_llvm_x86_avx2_pmovzxwd(<8 x i16>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovzxwd
-; CHECK: vpmovzxwd (%rdi), %ymm0
- %1 = load <8 x i16>, <8 x i16>* %a, align 1
- %2 = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %1)
- ret <8 x i32> %2
-}
-
-define <4 x i64> @test_llvm_x86_avx2_pmovzxwq(<8 x i16>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovzxwq
-; CHECK: vpmovzxwq (%rdi), %ymm0
- %1 = load <8 x i16>, <8 x i16>* %a, align 1
- %2 = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %1)
- ret <4 x i64> %2
-}
-
-define <4 x i64> @test_llvm_x86_avx2_pmovzxdq(<4 x i32>* %a) {
-; CHECK-LABEL: test_llvm_x86_avx2_pmovzxdq
-; CHECK: vpmovzxdq (%rdi), %ymm0
- %1 = load <4 x i32>, <4 x i32>* %a, align 1
- %2 = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %1)
- ret <4 x i64> %2
-}
-
-declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>)
-declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>)
-declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>)
-declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>)
-declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>)
-declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>)
-declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>)
-declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>)
-declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>)
-declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>)
-declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>)
-declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>)
diff --git a/test/CodeGen/X86/avx2-pmovxrm.ll b/test/CodeGen/X86/avx2-pmovxrm.ll
new file mode 100644
index 000000000000..1d0626f66eea
--- /dev/null
+++ b/test/CodeGen/X86/avx2-pmovxrm.ll
@@ -0,0 +1,201 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64
+
+define <16 x i16> @test_llvm_x86_avx2_pmovsxbw(<16 x i8>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovsxbw:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovsxbw (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovsxbw:
+; X64: ## BB#0:
+; X64-NEXT: vpmovsxbw (%rdi), %ymm0
+; X64-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8>* %a, align 1
+ %2 = sext <16 x i8> %1 to <16 x i16>
+ ret <16 x i16> %2
+}
+
+define <8 x i32> @test_llvm_x86_avx2_pmovsxbd(<16 x i8>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovsxbd:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovsxbd (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovsxbd:
+; X64: ## BB#0:
+; X64-NEXT: vpmovsxbd (%rdi), %ymm0
+; X64-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8>* %a, align 1
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %3 = sext <8 x i8> %2 to <8 x i32>
+ ret <8 x i32> %3
+}
+
+define <4 x i64> @test_llvm_x86_avx2_pmovsxbq(<16 x i8>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovsxbq:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovsxbq (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovsxbq:
+; X64: ## BB#0:
+; X64-NEXT: vpmovsxbq (%rdi), %ymm0
+; X64-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8>* %a, align 1
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = sext <4 x i8> %2 to <4 x i64>
+ ret <4 x i64> %3
+}
+
+define <8 x i32> @test_llvm_x86_avx2_pmovsxwd(<8 x i16>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovsxwd:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovsxwd (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovsxwd:
+; X64: ## BB#0:
+; X64-NEXT: vpmovsxwd (%rdi), %ymm0
+; X64-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %a, align 1
+ %2 = sext <8 x i16> %1 to <8 x i32>
+ ret <8 x i32> %2
+}
+
+define <4 x i64> @test_llvm_x86_avx2_pmovsxwq(<8 x i16>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovsxwq:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovsxwq (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovsxwq:
+; X64: ## BB#0:
+; X64-NEXT: vpmovsxwq (%rdi), %ymm0
+; X64-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %a, align 1
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = sext <4 x i16> %2 to <4 x i64>
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @test_llvm_x86_avx2_pmovsxdq(<4 x i32>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovsxdq:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovsxdq (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovsxdq:
+; X64: ## BB#0:
+; X64-NEXT: vpmovsxdq (%rdi), %ymm0
+; X64-NEXT: retq
+ %1 = load <4 x i32>, <4 x i32>* %a, align 1
+ %2 = sext <4 x i32> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
+
+define <16 x i16> @test_llvm_x86_avx2_pmovzxbw(<16 x i8>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovzxbw:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovzxbw:
+; X64: ## BB#0:
+; X64-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; X64-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8>* %a, align 1
+ %2 = zext <16 x i8> %1 to <16 x i16>
+ ret <16 x i16> %2
+}
+
+define <8 x i32> @test_llvm_x86_avx2_pmovzxbd(<16 x i8>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovzxbd:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovzxbd:
+; X64: ## BB#0:
+; X64-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8>* %a, align 1
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %3 = zext <8 x i8> %2 to <8 x i32>
+ ret <8 x i32> %3
+}
+
+define <4 x i64> @test_llvm_x86_avx2_pmovzxbq(<16 x i8>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovzxbq:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovzxbq:
+; X64: ## BB#0:
+; X64-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8>* %a, align 1
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = zext <4 x i8> %2 to <4 x i64>
+ ret <4 x i64> %3
+}
+
+define <8 x i32> @test_llvm_x86_avx2_pmovzxwd(<8 x i16>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovzxwd:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovzxwd:
+; X64: ## BB#0:
+; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %a, align 1
+ %2 = zext <8 x i16> %1 to <8 x i32>
+ ret <8 x i32> %2
+}
+
+define <4 x i64> @test_llvm_x86_avx2_pmovzxwq(<8 x i16>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovzxwq:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovzxwq:
+; X64: ## BB#0:
+; X64-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %a, align 1
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = zext <4 x i16> %2 to <4 x i64>
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @test_llvm_x86_avx2_pmovzxdq(<4 x i32>* %a) {
+; X32-LABEL: test_llvm_x86_avx2_pmovzxdq:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_llvm_x86_avx2_pmovzxdq:
+; X64: ## BB#0:
+; X64-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-NEXT: retq
+ %1 = load <4 x i32>, <4 x i32>* %a, align 1
+ %2 = zext <4 x i32> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 8fd50ae3015d..2ecf2fa5a6e7 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -442,8 +442,7 @@ define <2 x i64> @load_splat_2i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable re
; X32-LABEL: load_splat_2i64_2i64_1111:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: load_splat_2i64_2i64_1111:
@@ -494,14 +493,12 @@ define <2 x double> @load_splat_2f64_2f64_1111(<2 x double>* %ptr) nounwind uwta
; X32-LABEL: load_splat_2f64_2f64_1111:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vmovaps (%eax), %xmm0
-; X32-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: load_splat_2f64_2f64_1111:
; X64: ## BB#0: ## %entry
-; X64-NEXT: vmovaps (%rdi), %xmm0
-; X64-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
entry:
%ld = load <2 x double>, <2 x double>* %ptr
@@ -643,7 +640,7 @@ define void @crash() nounwind alwaysinline {
; X32-NEXT: je LBB31_1
; X32-NEXT: ## BB#2: ## %ret
; X32-NEXT: retl
-; X32-NEXT: .align 4, 0x90
+; X32-NEXT: .p2align 4, 0x90
; X32-NEXT: LBB31_1: ## %footer349VF
; X32-NEXT: ## =>This Inner Loop Header: Depth=1
; X32-NEXT: jmp LBB31_1
@@ -655,7 +652,7 @@ define void @crash() nounwind alwaysinline {
; X64-NEXT: je LBB31_1
; X64-NEXT: ## BB#2: ## %ret
; X64-NEXT: retq
-; X64-NEXT: .align 4, 0x90
+; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: LBB31_1: ## %footer349VF
; X64-NEXT: ## =>This Inner Loop Header: Depth=1
; X64-NEXT: jmp LBB31_1
diff --git a/test/CodeGen/X86/avx2-vbroadcasti128.ll b/test/CodeGen/X86/avx2-vbroadcasti128.ll
new file mode 100644
index 000000000000..2f11735af046
--- /dev/null
+++ b/test/CodeGen/X86/avx2-vbroadcasti128.ll
@@ -0,0 +1,129 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64
+
+define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
+; X32-LABEL: test_broadcast_2f64_4f64:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovapd (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vaddpd LCPI0_0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_2f64_4f64:
+; X64: ## BB#0:
+; X64-NEXT: vmovapd (%rdi), %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <2 x double>, <2 x double> *%p
+ %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %3 = fadd <4 x double> %2, <double 1.0, double 2.0, double 3.0, double 4.0>
+ ret <4 x double> %3
+}
+
+define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
+; X32-LABEL: test_broadcast_2i64_4i64:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovdqa (%eax), %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpaddq LCPI1_0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_2i64_4i64:
+; X64: ## BB#0:
+; X64-NEXT: vmovdqa (%rdi), %xmm0
+; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <2 x i64>, <2 x i64> *%p
+ %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %3 = add <4 x i64> %2, <i64 1, i64 2, i64 3, i64 4>
+ ret <4 x i64> %3
+}
+
+define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
+; X32-LABEL: test_broadcast_4f32_8f32:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps (%eax), %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vaddps LCPI2_0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_4f32_8f32:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <4 x float>, <4 x float> *%p
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %3 = fadd <8 x float> %2, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>
+ ret <8 x float> %3
+}
+
+define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
+; X32-LABEL: test_broadcast_4i32_8i32:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovdqa (%eax), %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpaddd LCPI3_0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_4i32_8i32:
+; X64: ## BB#0:
+; X64-NEXT: vmovdqa (%rdi), %xmm0
+; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <4 x i32>, <4 x i32> *%p
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %3 = add <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+ ret <8 x i32> %3
+}
+
+define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
+; X32-LABEL: test_broadcast_8i16_16i16:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovdqa (%eax), %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpaddw LCPI4_0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_8i16_16i16:
+; X64: ## BB#0:
+; X64-NEXT: vmovdqa (%rdi), %xmm0
+; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16> *%p
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %3 = add <16 x i16> %2, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16>
+ ret <16 x i16> %3
+}
+
+define <32 x i8> @test_broadcast_16i8_32i7(<16 x i8> *%p) nounwind {
+; X32-LABEL: test_broadcast_16i8_32i7:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovdqa (%eax), %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpaddb LCPI5_0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_16i8_32i7:
+; X64: ## BB#0:
+; X64-NEXT: vmovdqa (%rdi), %xmm0
+; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8> *%p
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %3 = add <32 x i8> %2, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32>
+ ret <32 x i8> %3
+}
diff --git a/test/CodeGen/X86/avx2-vector-shifts.ll b/test/CodeGen/X86/avx2-vector-shifts.ll
index b92b78035009..c9ab80bc5499 100644
--- a/test/CodeGen/X86/avx2-vector-shifts.ll
+++ b/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -1,266 +1,266 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s
; AVX2 Logical Shift Left
define <16 x i16> @test_sllw_1(<16 x i16> %InVec) {
+; CHECK-LABEL: test_sllw_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%shl = shl <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test_sllw_1:
-; CHECK-NOT: vpsllw $0, %ymm0, %ymm0
-; CHECK: ret
-
define <16 x i16> @test_sllw_2(<16 x i16> %InVec) {
+; CHECK-LABEL: test_sllw_2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpaddw %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = shl <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test_sllw_2:
-; CHECK: vpaddw %ymm0, %ymm0, %ymm0
-; CHECK: ret
-
define <16 x i16> @test_sllw_3(<16 x i16> %InVec) {
+; CHECK-LABEL: test_sllw_3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsllw $15, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = shl <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test_sllw_3:
-; CHECK: vpsllw $15, %ymm0, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_slld_1(<8 x i32> %InVec) {
+; CHECK-LABEL: test_slld_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%shl = shl <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test_slld_1:
-; CHECK-NOT: vpslld $0, %ymm0, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_slld_2(<8 x i32> %InVec) {
+; CHECK-LABEL: test_slld_2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = shl <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test_slld_2:
-; CHECK: vpaddd %ymm0, %ymm0, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_vpslld_var(i32 %shift) {
+; CHECK-LABEL: test_vpslld_var:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovd %edi, %xmm0
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
+; CHECK-NEXT: vpslld %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%amt = insertelement <8 x i32> undef, i32 %shift, i32 0
%tmp = shl <8 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199>, %amt
ret <8 x i32> %tmp
}
-; CHECK-LABEL: test_vpslld_var:
-; CHECK: vpslld %xmm0, %ymm1, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
+; CHECK-LABEL: test_slld_3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpslld $31, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = shl <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test_slld_3:
-; CHECK: vpslld $31, %ymm0, %ymm0
-; CHECK: ret
-
define <4 x i64> @test_sllq_1(<4 x i64> %InVec) {
+; CHECK-LABEL: test_sllq_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%shl = shl <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
ret <4 x i64> %shl
}
-; CHECK-LABEL: test_sllq_1:
-; CHECK-NOT: vpsllq $0, %ymm0, %ymm0
-; CHECK: ret
-
define <4 x i64> @test_sllq_2(<4 x i64> %InVec) {
+; CHECK-LABEL: test_sllq_2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = shl <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
ret <4 x i64> %shl
}
-; CHECK-LABEL: test_sllq_2:
-; CHECK: vpaddq %ymm0, %ymm0, %ymm0
-; CHECK: ret
-
define <4 x i64> @test_sllq_3(<4 x i64> %InVec) {
+; CHECK-LABEL: test_sllq_3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsllq $63, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = shl <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
ret <4 x i64> %shl
}
-; CHECK-LABEL: test_sllq_3:
-; CHECK: vpsllq $63, %ymm0, %ymm0
-; CHECK: ret
-
; AVX2 Arithmetic Shift
define <16 x i16> @test_sraw_1(<16 x i16> %InVec) {
+; CHECK-LABEL: test_sraw_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%shl = ashr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test_sraw_1:
-; CHECK-NOT: vpsraw $0, %ymm0, %ymm0
-; CHECK: ret
-
define <16 x i16> @test_sraw_2(<16 x i16> %InVec) {
+; CHECK-LABEL: test_sraw_2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsraw $1, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = ashr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test_sraw_2:
-; CHECK: vpsraw $1, %ymm0, %ymm0
-; CHECK: ret
-
define <16 x i16> @test_sraw_3(<16 x i16> %InVec) {
+; CHECK-LABEL: test_sraw_3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsraw $15, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = ashr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test_sraw_3:
-; CHECK: vpsraw $15, %ymm0, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_srad_1(<8 x i32> %InVec) {
+; CHECK-LABEL: test_srad_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%shl = ashr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test_srad_1:
-; CHECK-NOT: vpsrad $0, %ymm0, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_srad_2(<8 x i32> %InVec) {
+; CHECK-LABEL: test_srad_2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsrad $1, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = ashr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test_srad_2:
-; CHECK: vpsrad $1, %ymm0, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_srad_3(<8 x i32> %InVec) {
+; CHECK-LABEL: test_srad_3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsrad $31, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = ashr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test_srad_3:
-; CHECK: vpsrad $31, %ymm0, %ymm0
-; CHECK: ret
-
; SSE Logical Shift Right
define <16 x i16> @test_srlw_1(<16 x i16> %InVec) {
+; CHECK-LABEL: test_srlw_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%shl = lshr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test_srlw_1:
-; CHECK-NOT: vpsrlw $0, %ymm0, %ymm0
-; CHECK: ret
-
define <16 x i16> @test_srlw_2(<16 x i16> %InVec) {
+; CHECK-LABEL: test_srlw_2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsrlw $1, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = lshr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test_srlw_2:
-; CHECK: vpsrlw $1, %ymm0, %ymm0
-; CHECK: ret
-
define <16 x i16> @test_srlw_3(<16 x i16> %InVec) {
+; CHECK-LABEL: test_srlw_3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsrlw $15, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = lshr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test_srlw_3:
-; CHECK: vpsrlw $15, %ymm0, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_srld_1(<8 x i32> %InVec) {
+; CHECK-LABEL: test_srld_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%shl = lshr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test_srld_1:
-; CHECK-NOT: vpsrld $0, %ymm0, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_srld_2(<8 x i32> %InVec) {
+; CHECK-LABEL: test_srld_2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsrld $1, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = lshr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test_srld_2:
-; CHECK: vpsrld $1, %ymm0, %ymm0
-; CHECK: ret
-
define <8 x i32> @test_srld_3(<8 x i32> %InVec) {
+; CHECK-LABEL: test_srld_3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsrld $31, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = lshr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test_srld_3:
-; CHECK: vpsrld $31, %ymm0, %ymm0
-; CHECK: ret
-
define <4 x i64> @test_srlq_1(<4 x i64> %InVec) {
+; CHECK-LABEL: test_srlq_1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%shl = lshr <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
ret <4 x i64> %shl
}
-; CHECK-LABEL: test_srlq_1:
-; CHECK-NOT: vpsrlq $0, %ymm0, %ymm0
-; CHECK: ret
-
define <4 x i64> @test_srlq_2(<4 x i64> %InVec) {
+; CHECK-LABEL: test_srlq_2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsrlq $1, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = lshr <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
ret <4 x i64> %shl
}
-; CHECK-LABEL: test_srlq_2:
-; CHECK: vpsrlq $1, %ymm0, %ymm0
-; CHECK: ret
-
define <4 x i64> @test_srlq_3(<4 x i64> %InVec) {
+; CHECK-LABEL: test_srlq_3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsrlq $63, %ymm0, %ymm0
+; CHECK-NEXT: retq
entry:
%shl = lshr <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
ret <4 x i64> %shl
}
-; CHECK-LABEL: test_srlq_3:
-; CHECK: vpsrlq $63, %ymm0, %ymm0
-; CHECK: ret
-
-; CHECK-LABEL: @srl_trunc_and_v4i64
-; CHECK: vpand
-; CHECK-NEXT: vpsrlvd
-; CHECK: ret
define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
+; CHECK-LABEL: srl_trunc_and_v4i64:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
%trunc = trunc <4 x i64> %and to <4 x i32>
%sra = lshr <4 x i32> %x, %trunc
@@ -272,156 +272,171 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
;
define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
-; CHECK-LABEL: shl_8i16
-; CHECK: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK: retq
+; CHECK-LABEL: shl_8i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%shl = shl <8 x i16> %r, %a
ret <8 x i16> %shl
}
define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
-; CHECK-LABEL: shl_16i16
-; CHECK: vpxor %ymm2, %ymm2, %ymm2
-; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; CHECK-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
-; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
-; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
-; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-LABEL: shl_16i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
+; CHECK-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
+; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
+; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: retq
%shl = shl <16 x i16> %r, %a
ret <16 x i16> %shl
}
define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
-; CHECK-LABEL: shl_32i8
-; CHECK: vpsllw $5, %ymm1, %ymm1
-; CHECK-NEXT: vpsllw $4, %ymm0, %ymm2
-; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: vpsllw $2, %ymm0, %ymm2
-; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: vpaddb %ymm0, %ymm0, %ymm2
-; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-LABEL: shl_32i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsllw $5, %ymm1, %ymm1
+; CHECK-NEXT: vpsllw $4, %ymm0, %ymm2
+; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vpsllw $2, %ymm0, %ymm2
+; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%shl = shl <32 x i8> %r, %a
ret <32 x i8> %shl
}
define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
-; CHECK-LABEL: ashr_8i16
-; CHECK: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0
-; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK: retq
+; CHECK-LABEL: ashr_8i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0
+; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%ashr = ashr <8 x i16> %r, %a
ret <8 x i16> %ashr
}
define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
-; CHECK-LABEL: ashr_16i16
-; CHECK: vpxor %ymm2, %ymm2, %ymm2
-; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; CHECK-NEXT: vpsravd %ymm3, %ymm4, %ymm3
-; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
-; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
-; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-LABEL: ashr_16i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
+; CHECK-NEXT: vpsravd %ymm3, %ymm4, %ymm3
+; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
+; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: retq
%ashr = ashr <16 x i16> %r, %a
ret <16 x i16> %ashr
}
define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
-; CHECK-LABEL: ashr_32i8
-; CHECK: vpsllw $5, %ymm1, %ymm1
-; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; CHECK-NEXT: vpsraw $4, %ymm3, %ymm4
-; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; CHECK-NEXT: vpsraw $2, %ymm3, %ymm4
-; CHECK-NEXT: vpaddw %ymm2, %ymm2, %ymm2
-; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; CHECK-NEXT: vpsraw $1, %ymm3, %ymm4
-; CHECK-NEXT: vpaddw %ymm2, %ymm2, %ymm2
-; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
-; CHECK-NEXT: vpsrlw $8, %ymm2, %ymm2
-; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; CHECK-NEXT: vpsraw $4, %ymm0, %ymm3
-; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: vpsraw $2, %ymm0, %ymm3
-; CHECK-NEXT: vpaddw %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: vpsraw $1, %ymm0, %ymm3
-; CHECK-NEXT: vpaddw %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0
-; CHECK-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-LABEL: ashr_32i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsllw $5, %ymm1, %ymm1
+; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; CHECK-NEXT: vpsraw $4, %ymm3, %ymm4
+; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; CHECK-NEXT: vpsraw $2, %ymm3, %ymm4
+; CHECK-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; CHECK-NEXT: vpsraw $1, %ymm3, %ymm4
+; CHECK-NEXT: vpaddw %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; CHECK-NEXT: vpsrlw $8, %ymm2, %ymm2
+; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; CHECK-NEXT: vpsraw $4, %ymm0, %ymm3
+; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: vpsraw $2, %ymm0, %ymm3
+; CHECK-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: vpsraw $1, %ymm0, %ymm3
+; CHECK-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: vpsrlw $8, %ymm0, %ymm0
+; CHECK-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%ashr = ashr <32 x i8> %r, %a
ret <32 x i8> %ashr
}
define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
-; CHECK-LABEL: lshr_8i16
-; CHECK: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK: retq
+; CHECK-LABEL: lshr_8i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%lshr = lshr <8 x i16> %r, %a
ret <8 x i16> %lshr
}
define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
-; CHECK-LABEL: lshr_16i16
-; CHECK: vpxor %ymm2, %ymm2, %ymm2
-; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; CHECK-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
-; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
-; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
-; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-LABEL: lshr_16i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
+; CHECK-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
+; CHECK-NEXT: vpsrld $16, %ymm3, %ymm3
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsrld $16, %ymm0, %ymm0
+; CHECK-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
+; CHECK-NEXT: retq
%lshr = lshr <16 x i16> %r, %a
ret <16 x i16> %lshr
}
define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
-; CHECK-LABEL: lshr_32i8
-; CHECK: vpsllw $5, %ymm1, %ymm1
-; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm2
-; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: vpsrlw $2, %ymm0, %ymm2
-; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: vpsrlw $1, %ymm0, %ymm2
-; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-LABEL: lshr_32i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsllw $5, %ymm1, %ymm1
+; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm2
+; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vpsrlw $2, %ymm0, %ymm2
+; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vpsrlw $1, %ymm0, %ymm2
+; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: retq
%lshr = lshr <32 x i8> %r, %a
ret <32 x i8> %lshr
}
diff --git a/test/CodeGen/X86/avx2-vperm.ll b/test/CodeGen/X86/avx2-vperm.ll
index d576d0e3741e..cba8bbe4af40 100755
--- a/test/CodeGen/X86/avx2-vperm.ll
+++ b/test/CodeGen/X86/avx2-vperm.ll
@@ -1,34 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
define <8 x i32> @perm_cl_int_8x32(<8 x i32> %A) nounwind readnone {
+; CHECK-LABEL: perm_cl_int_8x32:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
entry:
-; CHECK: perm_cl_int_8x32
-; CHECK: vpermd
%B = shufflevector <8 x i32> %A, <8 x i32> undef, <8 x i32> <i32 0, i32 7, i32 2, i32 1, i32 2, i32 7, i32 6, i32 0>
ret <8 x i32> %B
}
define <8 x float> @perm_cl_fp_8x32(<8 x float> %A) nounwind readnone {
+; CHECK-LABEL: perm_cl_fp_8x32:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <u,7,2,u,4,u,1,6>
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
entry:
-; CHECK: perm_cl_fp_8x32
-; CHECK: vpermps
%B = shufflevector <8 x float> %A, <8 x float> undef, <8 x i32> <i32 undef, i32 7, i32 2, i32 undef, i32 4, i32 undef, i32 1, i32 6>
ret <8 x float> %B
}
define <4 x i64> @perm_cl_int_4x64(<4 x i64> %A) nounwind readnone {
+; CHECK-LABEL: perm_cl_int_4x64:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,1]
+; CHECK-NEXT: retq
entry:
-; CHECK: perm_cl_int_4x64
-; CHECK: vpermq
%B = shufflevector <4 x i64> %A, <4 x i64> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
ret <4 x i64> %B
}
define <4 x double> @perm_cl_fp_4x64(<4 x double> %A) nounwind readnone {
+; CHECK-LABEL: perm_cl_fp_4x64:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
+; CHECK-NEXT: retq
entry:
-; CHECK: perm_cl_fp_4x64
-; CHECK: vpermpd
%B = shufflevector <4 x double> %A, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
ret <4 x double> %B
}
diff --git a/test/CodeGen/X86/avx512-any_extend_load.ll b/test/CodeGen/X86/avx512-any_extend_load.ll
new file mode 100644
index 000000000000..b4336a86f6b4
--- /dev/null
+++ b/test/CodeGen/X86/avx512-any_extend_load.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gn -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gn -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=SKX
+
+
+define void @any_extend_load_v8i64(<8 x i8> * %ptr) {
+; ALL-LABEL: any_extend_load_v8i64:
+; ALL: # BB#0:
+; ALL-NEXT: vpmovzxbq (%rdi), %zmm0
+; ALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; ALL-NEXT: vpmovqb %zmm0, (%rdi)
+; ALL-NEXT: retq
+ %wide.load = load <8 x i8>, <8 x i8>* %ptr, align 1
+ %1 = zext <8 x i8> %wide.load to <8 x i64>
+ %2 = add nuw nsw <8 x i64> %1, <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4>
+ %3 = xor <8 x i64> %2, zeroinitializer
+ %4 = trunc <8 x i64> %3 to <8 x i8>
+ store <8 x i8> %4, <8 x i8>* %ptr, align 1
+ ret void
+}
+
+define void @any_extend_load_v8i32(<8 x i8> * %ptr) {
+; KNL-LABEL: any_extend_load_v8i32:
+; KNL: # BB#0:
+; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; KNL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; KNL-NEXT: vmovq %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: any_extend_load_v8i32:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovzxbd (%rdi), %ymm0
+; SKX-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0
+; SKX-NEXT: vpmovdb %ymm0, (%rdi)
+; SKX-NEXT: retq
+ %wide.load = load <8 x i8>, <8 x i8>* %ptr, align 1
+ %1 = zext <8 x i8> %wide.load to <8 x i32>
+ %2 = add nuw nsw <8 x i32> %1, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+ %3 = xor <8 x i32> %2, zeroinitializer
+ %4 = trunc <8 x i32> %3 to <8 x i8>
+ store <8 x i8> %4, <8 x i8>* %ptr, align 1
+ ret void
+}
+
+define void @any_extend_load_v8i16(<8 x i8> * %ptr) {
+; KNL-LABEL: any_extend_load_v8i16:
+; KNL: # BB#0:
+; KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; KNL-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; KNL-NEXT: vmovq %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: any_extend_load_v8i16:
+; SKX: # BB#0:
+; SKX-NEXT: vpmovzxbw (%rdi), %xmm0
+; SKX-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; SKX-NEXT: vpmovwb %xmm0, (%rdi)
+; SKX-NEXT: retq
+ %wide.load = load <8 x i8>, <8 x i8>* %ptr, align 1
+ %1 = zext <8 x i8> %wide.load to <8 x i16>
+ %2 = add nuw nsw <8 x i16> %1, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
+ %3 = xor <8 x i16> %2, zeroinitializer
+ %4 = trunc <8 x i16> %3 to <8 x i8>
+ store <8 x i8> %4, <8 x i8>* %ptr, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll
index 9220e4f269cd..62dece137cc0 100644
--- a/test/CodeGen/X86/avx512-arith.ll
+++ b/test/CodeGen/X86/avx512-arith.ll
@@ -94,10 +94,10 @@ define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm3
; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm3
; AVX512F-NEXT: vpsllq $32, %zmm3, %zmm3
-; AVX512F-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm1
; AVX512F-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vpsllq $32, %zmm0, %zmm0
+; AVX512F-NEXT: vpaddq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
@@ -107,10 +107,10 @@ define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
; AVX512VL-NEXT: vpsrlq $32, %zmm0, %zmm3
; AVX512VL-NEXT: vpmuludq %zmm3, %zmm1, %zmm3
; AVX512VL-NEXT: vpsllq $32, %zmm3, %zmm3
-; AVX512VL-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; AVX512VL-NEXT: vpsrlq $32, %zmm1, %zmm1
; AVX512VL-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT: vpsllq $32, %zmm0, %zmm0
+; AVX512VL-NEXT: vpaddq %zmm0, %zmm3, %zmm0
; AVX512VL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
@@ -120,10 +120,10 @@ define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm3
; AVX512BW-NEXT: vpsllq $32, %zmm3, %zmm3
-; AVX512BW-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm1
; AVX512BW-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpsllq $32, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
@@ -140,6 +140,128 @@ define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
ret <8 x i64>%z
}
+define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) {
+; AVX512F-LABEL: imulq256:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmuludq %ymm0, %ymm1, %ymm2
+; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm3
+; AVX512F-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
+; AVX512F-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX512F-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: imulq256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmuludq %ymm0, %ymm1, %ymm2
+; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm3
+; AVX512VL-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
+; AVX512VL-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: imulq256:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpmuludq %ymm0, %ymm1, %ymm2
+; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm3
+; AVX512BW-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
+; AVX512BW-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX512BW-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX512BW-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX512BW-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: imulq256:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vpmuludq %ymm0, %ymm1, %ymm2
+; AVX512DQ-NEXT: vpsrlq $32, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
+; AVX512DQ-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX512DQ-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; SKX-LABEL: imulq256:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmullq %ymm0, %ymm1, %ymm0
+; SKX-NEXT: retq
+ %z = mul <4 x i64>%x, %y
+ ret <4 x i64>%z
+}
+
+define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) {
+; AVX512F-LABEL: imulq128:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmuludq %xmm0, %xmm1, %xmm2
+; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm3
+; AVX512F-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
+; AVX512F-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX512F-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX512F-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512F-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: imulq128:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vpmuludq %xmm0, %xmm1, %xmm2
+; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm3
+; AVX512VL-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
+; AVX512VL-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX512VL-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX512VL-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512VL-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: imulq128:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpmuludq %xmm0, %xmm1, %xmm2
+; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm3
+; AVX512BW-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
+; AVX512BW-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX512BW-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; AVX512BW-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: imulq128:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vpmuludq %xmm0, %xmm1, %xmm2
+; AVX512DQ-NEXT: vpsrlq $32, %xmm0, %xmm3
+; AVX512DQ-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
+; AVX512DQ-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; AVX512DQ-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX512DQ-NEXT: retq
+;
+; SKX-LABEL: imulq128:
+; SKX: ## BB#0:
+; SKX-NEXT: vpmullq %xmm0, %xmm1, %xmm0
+; SKX-NEXT: retq
+ %z = mul <2 x i64>%x, %y
+ ret <2 x i64>%z
+}
+
define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) {
; CHECK-LABEL: mulpd512:
; CHECK: ## BB#0: ## %entry
@@ -553,6 +675,7 @@ define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i,
define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i,
; AVX512F-LABEL: test_mask_vminpd:
; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
; AVX512F-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX512F-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; AVX512F-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
@@ -560,13 +683,14 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i,
;
; AVX512VL-LABEL: test_mask_vminpd:
; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpxord %ymm4, %ymm4, %ymm4
; AVX512VL-NEXT: vpcmpneqd %ymm4, %ymm3, %k1
; AVX512VL-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: test_mask_vminpd:
; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: ## kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
; AVX512BW-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX512BW-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; AVX512BW-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
@@ -574,6 +698,7 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i,
;
; AVX512DQ-LABEL: test_mask_vminpd:
; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX512DQ-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; AVX512DQ-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
@@ -581,7 +706,7 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i,
;
; SKX-LABEL: test_mask_vminpd:
; SKX: ## BB#0:
-; SKX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; SKX-NEXT: vpxord %ymm4, %ymm4, %ymm4
; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1
; SKX-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
@@ -613,6 +738,7 @@ define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i,
define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i,
; AVX512F-LABEL: test_mask_vmaxpd:
; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
; AVX512F-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX512F-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; AVX512F-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
@@ -620,13 +746,14 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i,
;
; AVX512VL-LABEL: test_mask_vmaxpd:
; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpxord %ymm4, %ymm4, %ymm4
; AVX512VL-NEXT: vpcmpneqd %ymm4, %ymm3, %k1
; AVX512VL-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: test_mask_vmaxpd:
; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: ## kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
; AVX512BW-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX512BW-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; AVX512BW-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
@@ -634,6 +761,7 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i,
;
; AVX512DQ-LABEL: test_mask_vmaxpd:
; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: ## kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX512DQ-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; AVX512DQ-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
@@ -641,7 +769,7 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i,
;
; SKX-LABEL: test_mask_vmaxpd:
; SKX: ## BB#0:
-; SKX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; SKX-NEXT: vpxord %ymm4, %ymm4, %ymm4
; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1
; SKX-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-bugfix-23634.ll b/test/CodeGen/X86/avx512-bugfix-23634.ll
index c31a13ad3114..0dcfb7c169f3 100644
--- a/test/CodeGen/X86/avx512-bugfix-23634.ll
+++ b/test/CodeGen/X86/avx512-bugfix-23634.ll
@@ -1,13 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-; CHECK-LABEL: f_fu
-; CHECK-NOT: vpblend
-; CHECK: vmovdqa32 {{.*}} {%k1}
-
define void @f_fu(float* %ret, float* %aa, float %b) {
+; CHECK-LABEL: f_fu:
+; CHECK: ## BB#0: ## %allocas
+; CHECK-NEXT: vcvttss2si %xmm0, %eax
+; CHECK-NEXT: vpbroadcastd %eax, %zmm0
+; CHECK-NEXT: vcvttps2dq (%rsi), %zmm1
+; CHECK-NEXT: vpsrld $31, %zmm0, %zmm2
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm2
+; CHECK-NEXT: vpsrad $1, %zmm2, %zmm2
+; CHECK-NEXT: movw $-21846, %ax ## imm = 0xAAAA
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: vpblendmd {{.*}}(%rip), %zmm1, %zmm1 {%k1}
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0
+; CHECK-NEXT: vmovups %zmm0, (%rdi)
+; CHECK-NEXT: retq
allocas:
%ptr_cast_for_load = bitcast float* %aa to <16 x float>*
%ptr_masked_load.39 = load <16 x float>, <16 x float>* %ptr_cast_for_load, align 4
@@ -23,13 +36,13 @@ allocas:
%v1.i = select <16 x i1> <i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>, <16 x i32> %a_load_to_int32
- %foo_test = add <16 x i32> %div_v019_load_, %b_load_to_int32_broadcast
+ %foo_test = add <16 x i32> %div_v019_load_, %b_load_to_int32_broadcast
- %add_struct_offset_y_struct_offset33_x = add <16 x i32> %foo_test, %v1.i
+ %add_struct_offset_y_struct_offset33_x = add <16 x i32> %foo_test, %v1.i
%val = sitofp <16 x i32> %add_struct_offset_y_struct_offset33_x to <16 x float>
%ptrcast = bitcast float* %ret to <16 x float>*
store <16 x float> %val, <16 x float>* %ptrcast, align 4
ret void
-}
\ No newline at end of file
+}
diff --git a/test/CodeGen/X86/avx512-bugfix-26264.ll b/test/CodeGen/X86/avx512-bugfix-26264.ll
new file mode 100644
index 000000000000..b3e1b17076bb
--- /dev/null
+++ b/test/CodeGen/X86/avx512-bugfix-26264.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw < %s | FileCheck %s --check-prefix=AVX512BW
+
+define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) {
+; AVX512BW-LABEL: test_load_32f64:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
+; AVX512BW-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: vmovupd 128(%rdi), %zmm3 {%k2}
+; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
+; AVX512BW-NEXT: kshiftrw $8, %k2, %k1
+; AVX512BW-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
+; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: vmovaps %zmm2, %zmm1
+; AVX512BW-NEXT: vmovaps %zmm3, %zmm2
+; AVX512BW-NEXT: vmovaps %zmm4, %zmm3
+; AVX512BW-NEXT: retq
+ %res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
+ ret <32 x double> %res
+}
+
+define <32 x i64> @test_load_32i64(<32 x i64>* %ptrs, <32 x i1> %mask, <32 x i64> %src0) {
+; AVX512BW-LABEL: test_load_32i64:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
+; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
+; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm3 {%k2}
+; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
+; AVX512BW-NEXT: kshiftrw $8, %k2, %k1
+; AVX512BW-NEXT: vmovdqu64 192(%rdi), %zmm4 {%k1}
+; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: vmovaps %zmm2, %zmm1
+; AVX512BW-NEXT: vmovaps %zmm3, %zmm2
+; AVX512BW-NEXT: vmovaps %zmm4, %zmm3
+; AVX512BW-NEXT: retq
+ %res = call <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>* %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0)
+ ret <32 x i64> %res
+}
+
+declare <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>* %ptrs, i32, <32 x i1> %mask, <32 x i64> %src0)
+declare <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)
diff --git a/test/CodeGen/X86/avx512-build-vector.ll b/test/CodeGen/X86/avx512-build-vector.ll
index 0f89aa71162e..980b87187d98 100644
--- a/test/CodeGen/X86/avx512-build-vector.ll
+++ b/test/CodeGen/X86/avx512-build-vector.ll
@@ -4,7 +4,8 @@
define <16 x i32> @test2(<16 x i32> %x) {
; CHECK-LABEL: test2:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = add <16 x i32><i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %x
ret <16 x i32>%res
@@ -15,8 +16,8 @@ define <16 x float> @test3(<4 x float> %a) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vmovss %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; CHECK-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,0],xmm0[0,1]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1
diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll
index a61aeba5aff9..fce592a5318b 100644
--- a/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/test/CodeGen/X86/avx512-calling-conv.ll
@@ -1,13 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL_X64 --check-prefix=SKX
-; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL_X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=knl | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=skx | FileCheck %s --check-prefix=ALL_X64 --check-prefix=SKX
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mcpu=knl | FileCheck %s --check-prefix=KNL_X32
define <16 x i1> @test1() {
-; ALL_X64-LABEL: test1:
-; ALL_X64: ## BB#0:
-; ALL_X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; ALL_X64-NEXT: retq
+; KNL-LABEL: test1:
+; KNL: ## BB#0:
+; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; SKX-NEXT: retq
;
; KNL_X32-LABEL: test1:
; KNL_X32: ## BB#0:
@@ -25,7 +30,8 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) {
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: retq
;
@@ -47,7 +53,8 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) {
; KNL_X32-NEXT: vpslld $31, %zmm0, %zmm0
; KNL_X32-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_X32-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
-; KNL_X32-NEXT: vpbroadcastd LCPI1_0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL_X32-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0
; KNL_X32-NEXT: retl
%c = and <16 x i1>%a, %b
@@ -63,7 +70,8 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) {
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1}
-; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqw %zmm0, %xmm0
; KNL-NEXT: retq
;
@@ -86,8 +94,8 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) {
; KNL_X32-NEXT: vpsllvq %zmm2, %zmm0, %zmm0
; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1
; KNL_X32-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1}
-; KNL_X32-NEXT: vpbroadcastd LCPI2_1, %zmm0
-; KNL_X32-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL_X32-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0
; KNL_X32-NEXT: retl
%c = and <8 x i1>%a, %b
@@ -102,11 +110,10 @@ define <4 x i1> @test4(<4 x i1>%a, <4 x i1>%b) {
;
; SKX-LABEL: test4:
; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k0
-; SKX-NEXT: vpslld $31, %xmm1, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
-; SKX-NEXT: kandw %k1, %k0, %k0
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vptestmd %xmm1, %xmm1, %k0 {%k1}
; SKX-NEXT: vpmovm2d %k0, %xmm0
; SKX-NEXT: retq
;
@@ -128,6 +135,7 @@ define <8 x i32> @test5(<8 x i32>%a, <8 x i32>%b) {
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL-NEXT: callq _func8xi1
; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-NEXT: vpslld $31, %ymm0, %ymm0
@@ -143,7 +151,7 @@ define <8 x i32> @test5(<8 x i32>%a, <8 x i32>%b) {
; SKX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
; SKX-NEXT: vpmovm2w %k0, %xmm0
; SKX-NEXT: callq _func8xi1
-; SKX-NEXT: vpmovzxwd %xmm0, %ymm0
+; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SKX-NEXT: vpslld $31, %ymm0, %ymm0
; SKX-NEXT: vpsrad $31, %ymm0, %ymm0
; SKX-NEXT: popq %rax
@@ -156,7 +164,8 @@ define <8 x i32> @test5(<8 x i32>%a, <8 x i32>%b) {
; KNL_X32-NEXT: .cfi_def_cfa_offset 16
; KNL_X32-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0
-; KNL_X32-NEXT: calll L_func8xi1$stub
+; KNL_X32-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_X32-NEXT: calll _func8xi1
; KNL_X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL_X32-NEXT: vpslld $31, %ymm0, %ymm0
; KNL_X32-NEXT: vpsrad $31, %ymm0, %ymm0
@@ -177,10 +186,11 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
; KNL-NEXT: Ltmp1:
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: callq _func16xi1
-; KNL-NEXT: vpmovzxbd %xmm0, %zmm0
+; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vpsrad $31, %zmm0, %zmm0
; KNL-NEXT: popq %rax
@@ -194,7 +204,7 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
; SKX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
; SKX-NEXT: callq _func16xi1
-; SKX-NEXT: vpmovzxbd %xmm0, %zmm0
+; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; SKX-NEXT: vpslld $31, %zmm0, %zmm0
; SKX-NEXT: vpsrad $31, %zmm0, %zmm0
; SKX-NEXT: popq %rax
@@ -206,10 +216,11 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
; KNL_X32-NEXT: Ltmp1:
; KNL_X32-NEXT: .cfi_def_cfa_offset 16
; KNL_X32-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
-; KNL_X32-NEXT: vpbroadcastd LCPI5_0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL_X32-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0
-; KNL_X32-NEXT: calll L_func16xi1$stub
-; KNL_X32-NEXT: vpmovzxbd %xmm0, %zmm0
+; KNL_X32-NEXT: calll _func16xi1
+; KNL_X32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL_X32-NEXT: vpslld $31, %zmm0, %zmm0
; KNL_X32-NEXT: vpsrad $31, %zmm0, %zmm0
; KNL_X32-NEXT: addl $12, %esp
@@ -254,7 +265,7 @@ define <4 x i32> @test7(<4 x i32>%a, <4 x i32>%b) {
; KNL_X32-NEXT: Ltmp2:
; KNL_X32-NEXT: .cfi_def_cfa_offset 16
; KNL_X32-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; KNL_X32-NEXT: calll L_func4xi1$stub
+; KNL_X32-NEXT: calll _func4xi1
; KNL_X32-NEXT: vpslld $31, %xmm0, %xmm0
; KNL_X32-NEXT: vpsrad $31, %xmm0, %xmm0
; KNL_X32-NEXT: addl $12, %esp
@@ -273,14 +284,15 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) {
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL-NEXT: callq _func8xi1
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: movb $85, %al
-; KNL-NEXT: movzbl %al, %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1}
-; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqw %zmm0, %xmm0
; KNL-NEXT: popq %rax
; KNL-NEXT: retq
@@ -309,15 +321,15 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) {
; KNL_X32-NEXT: .cfi_def_cfa_offset 16
; KNL_X32-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0
-; KNL_X32-NEXT: calll L_func8xi1$stub
+; KNL_X32-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_X32-NEXT: calll _func8xi1
; KNL_X32-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL_X32-NEXT: vpsllvq LCPI7_0, %zmm0, %zmm0
; KNL_X32-NEXT: movb $85, %al
-; KNL_X32-NEXT: movzbl %al, %eax
; KNL_X32-NEXT: kmovw %eax, %k1
; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1}
-; KNL_X32-NEXT: vpbroadcastd LCPI7_1, %zmm0
-; KNL_X32-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL_X32-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0
; KNL_X32-NEXT: addl $12, %esp
; KNL_X32-NEXT: retl
@@ -328,14 +340,23 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) {
}
define <16 x i8> @test8(<16 x i8> %a1, <16 x i8> %a2, i1 %cond) {
-; ALL_X64-LABEL: test8:
-; ALL_X64: ## BB#0:
-; ALL_X64-NEXT: testb $1, %dil
-; ALL_X64-NEXT: jne LBB8_2
-; ALL_X64-NEXT: ## BB#1:
-; ALL_X64-NEXT: vmovaps %zmm1, %zmm0
-; ALL_X64-NEXT: LBB8_2:
-; ALL_X64-NEXT: retq
+; KNL-LABEL: test8:
+; KNL: ## BB#0:
+; KNL-NEXT: testb $1, %dil
+; KNL-NEXT: jne LBB8_2
+; KNL-NEXT: ## BB#1:
+; KNL-NEXT: vmovaps %zmm1, %zmm0
+; KNL-NEXT: LBB8_2:
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test8:
+; SKX: ## BB#0:
+; SKX-NEXT: testb $1, %dil
+; SKX-NEXT: jne LBB8_2
+; SKX-NEXT: ## BB#1:
+; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: LBB8_2:
+; SKX-NEXT: retq
;
; KNL_X32-LABEL: test8:
; KNL_X32: ## BB#0:
@@ -358,7 +379,7 @@ define i1 @test9(double %a, double %b) {
;
; KNL_X32-LABEL: test9:
; KNL_X32: ## BB#0:
-; KNL_X32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0
+; KNL_X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; KNL_X32-NEXT: vucomisd {{[0-9]+}}(%esp), %xmm0
; KNL_X32-NEXT: setb %al
; KNL_X32-NEXT: retl
@@ -464,7 +485,7 @@ define i32 @test12(i32 %a1, i32 %a2, i32 %b1) {
; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; KNL_X32-NEXT: movl %edi, (%esp)
; KNL_X32-NEXT: calll _test11
-; KNL_X32-NEXT: movb %al, %bl
+; KNL_X32-NEXT: movl %eax, %ebx
; KNL_X32-NEXT: movzbl %bl, %eax
; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; KNL_X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
diff --git a/test/CodeGen/X86/avx512-cmp.ll b/test/CodeGen/X86/avx512-cmp.ll
index 6e0d18558c51..fceb9c14b7df 100644
--- a/test/CodeGen/X86/avx512-cmp.ll
+++ b/test/CodeGen/X86/avx512-cmp.ll
@@ -1,8 +1,19 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=SKX
-; CHECK-LABEL: test1
-; CHECK: vucomisd {{.*}}encoding: [0x62
define double @test1(double %a, double %b) nounwind {
+; ALL-LABEL: test1:
+; ALL: ## BB#0:
+; ALL-NEXT: vucomisd %xmm1, %xmm0
+; ALL-NEXT: jne LBB0_1
+; ALL-NEXT: jnp LBB0_2
+; ALL-NEXT: LBB0_1: ## %l1
+; ALL-NEXT: vsubsd %xmm1, %xmm0, %xmm0
+; ALL-NEXT: retq
+; ALL-NEXT: LBB0_2: ## %l2
+; ALL-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; ALL-NEXT: retq
%tobool = fcmp une double %a, %b
br i1 %tobool, label %l1, label %l2
@@ -14,9 +25,17 @@ l2:
ret double %c1
}
-; CHECK-LABEL: test2
-; CHECK: vucomiss {{.*}}encoding: [0x62
define float @test2(float %a, float %b) nounwind {
+; ALL-LABEL: test2:
+; ALL: ## BB#0:
+; ALL-NEXT: vucomiss %xmm0, %xmm1
+; ALL-NEXT: jbe LBB1_2
+; ALL-NEXT: ## BB#1: ## %l1
+; ALL-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; ALL-NEXT: retq
+; ALL-NEXT: LBB1_2: ## %l2
+; ALL-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; ALL-NEXT: retq
%tobool = fcmp olt float %a, %b
br i1 %tobool, label %l1, label %l2
@@ -29,18 +48,35 @@ l2:
}
; FIXME: Can use vcmpeqss and extract from the mask here in AVX512.
-; CHECK-LABEL: test3
-; CHECK: vucomiss {{.*}}encoding: [0x62
define i32 @test3(float %a, float %b) {
+; ALL-LABEL: test3:
+; ALL: ## BB#0:
+; ALL-NEXT: vucomiss %xmm1, %xmm0
+; ALL-NEXT: setnp %al
+; ALL-NEXT: sete %cl
+; ALL-NEXT: andb %al, %cl
+; ALL-NEXT: movzbl %cl, %eax
+; ALL-NEXT: retq
%cmp10.i = fcmp oeq float %a, %b
%conv11.i = zext i1 %cmp10.i to i32
ret i32 %conv11.i
}
-; CHECK-LABEL: test5
-; CHECK: ret
define float @test5(float %p) #0 {
+; ALL-LABEL: test5:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vucomiss %xmm1, %xmm0
+; ALL-NEXT: jne LBB3_1
+; ALL-NEXT: jnp LBB3_2
+; ALL-NEXT: LBB3_1: ## %if.end
+; ALL-NEXT: seta %al
+; ALL-NEXT: movzbl %al, %eax
+; ALL-NEXT: leaq {{.*}}(%rip), %rcx
+; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT: LBB3_2: ## %return
+; ALL-NEXT: retq
entry:
%cmp = fcmp oeq float %p, 0.000000e+00
br i1 %cmp, label %return, label %if.end
@@ -55,21 +91,25 @@ return: ; preds = %if.end, %entry
ret float %retval.0
}
-; CHECK-LABEL: test6
-; CHECK: cmpl
-; CHECK-NOT: kmov
-; CHECK: ret
define i32 @test6(i32 %a, i32 %b) {
+; ALL-LABEL: test6:
+; ALL: ## BB#0:
+; ALL-NEXT: xorl %eax, %eax
+; ALL-NEXT: cmpl %esi, %edi
+; ALL-NEXT: sete %al
+; ALL-NEXT: retq
%cmp = icmp eq i32 %a, %b
%res = zext i1 %cmp to i32
ret i32 %res
}
-; CHECK-LABEL: test7
-; CHECK: vucomisd
-; CHECK-NOT: kmov
-; CHECK: ret
define i32 @test7(double %x, double %y) #2 {
+; ALL-LABEL: test7:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: xorl %eax, %eax
+; ALL-NEXT: vucomisd %xmm1, %xmm0
+; ALL-NEXT: setne %al
+; ALL-NEXT: retq
entry:
%0 = fcmp one double %x, %y
%or = zext i1 %0 to i32
@@ -77,6 +117,16 @@ entry:
}
define i32 @test8(i32 %a1, i32 %a2, i32 %a3) {
+; ALL-LABEL: test8:
+; ALL: ## BB#0:
+; ALL-NEXT: testl %edx, %edx
+; ALL-NEXT: movl $1, %eax
+; ALL-NEXT: cmovel %eax, %edx
+; ALL-NEXT: cmpl $-2147483648, %esi ## imm = 0x80000000
+; ALL-NEXT: cmovnel %edx, %eax
+; ALL-NEXT: cmpl $-1, %edi
+; ALL-NEXT: cmovnel %edx, %eax
+; ALL-NEXT: retq
%tmp1 = icmp eq i32 %a1, -1
%tmp2 = icmp eq i32 %a2, -2147483648
%tmp3 = and i1 %tmp1, %tmp2
@@ -86,11 +136,17 @@ define i32 @test8(i32 %a1, i32 %a2, i32 %a3) {
ret i32 %res
}
-; CHECK-LABEL: test9
-; CHECK: testb
-; CHECK-NOT: kmov
-; CHECK: ret
define i32 @test9(i64 %a) {
+; ALL-LABEL: test9:
+; ALL: ## BB#0:
+; ALL-NEXT: testb $1, %dil
+; ALL-NEXT: jne LBB7_2
+; ALL-NEXT: ## BB#1: ## %A
+; ALL-NEXT: movl $6, %eax
+; ALL-NEXT: retq
+; ALL-NEXT: LBB7_2: ## %B
+; ALL-NEXT: movl $7, %eax
+; ALL-NEXT: retq
%b = and i64 %a, 1
%cmp10.i = icmp eq i64 %b, 0
br i1 %cmp10.i, label %A, label %B
@@ -99,3 +155,35 @@ A:
B:
ret i32 7
}
+
+define i32 @test10(i64 %b, i64 %c, i1 %d) {
+; ALL-LABEL: test10:
+; ALL: ## BB#0:
+; ALL-NEXT: andl $1, %edx
+; ALL-NEXT: kmovw %edx, %k0
+; ALL-NEXT: cmpq %rsi, %rdi
+; ALL-NEXT: sete %al
+; ALL-NEXT: kmovw %eax, %k1
+; ALL-NEXT: korw %k1, %k0, %k1
+; ALL-NEXT: kxorw %k1, %k0, %k0
+; ALL-NEXT: kmovw %k0, %eax
+; ALL-NEXT: testb %al, %al
+; ALL-NEXT: je LBB8_1
+; ALL-NEXT: ## BB#2: ## %if.end.i
+; ALL-NEXT: movl $6, %eax
+; ALL-NEXT: retq
+; ALL-NEXT: LBB8_1: ## %if.then.i
+; ALL-NEXT: movl $5, %eax
+; ALL-NEXT: retq
+
+ %cmp8.i = icmp eq i64 %b, %c
+ %or1 = or i1 %d, %cmp8.i
+ %xor1 = xor i1 %d, %or1
+ br i1 %xor1, label %if.end.i, label %if.then.i
+
+if.then.i:
+ ret i32 5
+
+if.end.i:
+ ret i32 6
+}
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index 586a29545014..914f859927be 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -1,228 +1,511 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx --show-mc-encoding | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=SKX
-; CHECK-LABEL: sitof32
-; CHECK: vcvtdq2ps %zmm
-; CHECK: ret
define <16 x float> @sitof32(<16 x i32> %a) nounwind {
+; ALL-LABEL: sitof32:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
+; ALL-NEXT: retq
%b = sitofp <16 x i32> %a to <16 x float>
ret <16 x float> %b
}
-; CHECK-LABEL: sltof864
-; CHECK: vcvtqq2pd
define <8 x double> @sltof864(<8 x i64> %a) {
+; KNL-LABEL: sltof864:
+; KNL: ## BB#0:
+; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; KNL-NEXT: vpextrq $1, %xmm1, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
+; KNL-NEXT: vmovq %xmm1, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; KNL-NEXT: vpextrq $1, %xmm2, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3
+; KNL-NEXT: vmovq %xmm2, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; KNL-NEXT: vpextrq $1, %xmm2, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3
+; KNL-NEXT: vmovq %xmm2, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sltof864:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtqq2pd %zmm0, %zmm0
+; SKX-NEXT: retq
%b = sitofp <8 x i64> %a to <8 x double>
ret <8 x double> %b
}
-; CHECK-LABEL: sltof464
-; CHECK: vcvtqq2pd
define <4 x double> @sltof464(<4 x i64> %a) {
+; KNL-LABEL: sltof464:
+; KNL: ## BB#0:
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpextrq $1, %xmm1, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
+; KNL-NEXT: vmovq %xmm1, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sltof464:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtqq2pd %ymm0, %ymm0
+; SKX-NEXT: retq
%b = sitofp <4 x i64> %a to <4 x double>
ret <4 x double> %b
}
-; CHECK-LABEL: sltof2f32
-; CHECK: vcvtqq2ps
define <2 x float> @sltof2f32(<2 x i64> %a) {
+; KNL-LABEL: sltof2f32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sltof2f32:
+; SKX: ## BB#0:
+; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT: vcvtqq2ps %ymm0, %xmm0
+; SKX-NEXT: retq
%b = sitofp <2 x i64> %a to <2 x float>
ret <2 x float>%b
}
-; CHECK-LABEL: sltof4f32_mem
-; CHECK: vcvtqq2psy (%rdi)
define <4 x float> @sltof4f32_mem(<4 x i64>* %a) {
+; KNL-LABEL: sltof4f32_mem:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovdqu (%rdi), %ymm0
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sltof4f32_mem:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtqq2psy (%rdi), %xmm0
+; SKX-NEXT: retq
%a1 = load <4 x i64>, <4 x i64>* %a, align 8
%b = sitofp <4 x i64> %a1 to <4 x float>
ret <4 x float>%b
}
-; CHECK-LABEL: f64tosl
-; CHECK: vcvttpd2qq
define <4 x i64> @f64tosl(<4 x double> %a) {
+; KNL-LABEL: f64tosl:
+; KNL: ## BB#0:
+; KNL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; KNL-NEXT: vcvttsd2si %xmm1, %rax
+; KNL-NEXT: vmovq %rax, %xmm2
+; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; KNL-NEXT: vcvttsd2si %xmm1, %rax
+; KNL-NEXT: vmovq %rax, %xmm1
+; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; KNL-NEXT: vcvttsd2si %xmm0, %rax
+; KNL-NEXT: vmovq %rax, %xmm2
+; KNL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; KNL-NEXT: vcvttsd2si %xmm0, %rax
+; KNL-NEXT: vmovq %rax, %xmm0
+; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: f64tosl:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvttpd2qq %ymm0, %ymm0
+; SKX-NEXT: retq
%b = fptosi <4 x double> %a to <4 x i64>
ret <4 x i64> %b
}
-; CHECK-LABEL: f32tosl
-; CHECK: vcvttps2qq
define <4 x i64> @f32tosl(<4 x float> %a) {
+; KNL-LABEL: f32tosl:
+; KNL: ## BB#0:
+; KNL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; KNL-NEXT: vcvttss2si %xmm1, %rax
+; KNL-NEXT: vmovq %rax, %xmm1
+; KNL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; KNL-NEXT: vcvttss2si %xmm2, %rax
+; KNL-NEXT: vmovq %rax, %xmm2
+; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; KNL-NEXT: vcvttss2si %xmm0, %rax
+; KNL-NEXT: vmovq %rax, %xmm2
+; KNL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; KNL-NEXT: vcvttss2si %xmm0, %rax
+; KNL-NEXT: vmovq %rax, %xmm0
+; KNL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: f32tosl:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvttps2qq %xmm0, %ymm0
+; SKX-NEXT: retq
%b = fptosi <4 x float> %a to <4 x i64>
ret <4 x i64> %b
}
-; CHECK-LABEL: sltof432
-; CHECK: vcvtqq2ps
define <4 x float> @sltof432(<4 x i64> %a) {
+; KNL-LABEL: sltof432:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sltof432:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtqq2ps %ymm0, %xmm0
+; SKX-NEXT: retq
%b = sitofp <4 x i64> %a to <4 x float>
ret <4 x float> %b
}
-; CHECK-LABEL: ultof432
-; CHECK: vcvtuqq2ps
define <4 x float> @ultof432(<4 x i64> %a) {
+; KNL-LABEL: ultof432:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtusi2ssq %rax, %xmm0, %xmm1
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtusi2ssq %rax, %xmm0, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtusi2ssq %rax, %xmm0, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtusi2ssq %rax, %xmm0, %xmm0
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: ultof432:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtuqq2ps %ymm0, %xmm0
+; SKX-NEXT: retq
%b = uitofp <4 x i64> %a to <4 x float>
ret <4 x float> %b
}
-; CHECK-LABEL: ultof64
-; CHECK: vcvtuqq2pd
define <8 x double> @ultof64(<8 x i64> %a) {
+; KNL-LABEL: ultof64:
+; KNL: ## BB#0:
+; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; KNL-NEXT: vpextrq $1, %xmm1, %rax
+; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm2
+; KNL-NEXT: vmovq %xmm1, %rax
+; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm1
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; KNL-NEXT: vpextrq $1, %xmm2, %rax
+; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm3
+; KNL-NEXT: vmovq %xmm2, %rax
+; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm2
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; KNL-NEXT: vpextrq $1, %xmm2, %rax
+; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm3
+; KNL-NEXT: vmovq %xmm2, %rax
+; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm2
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm3
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vcvtusi2sdq %rax, %xmm0, %xmm0
+; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: ultof64:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtuqq2pd %zmm0, %zmm0
+; SKX-NEXT: retq
%b = uitofp <8 x i64> %a to <8 x double>
ret <8 x double> %b
}
-; CHECK-LABEL: fptosi00
-; CHECK: vcvttps2dq %zmm
-; CHECK: ret
define <16 x i32> @fptosi00(<16 x float> %a) nounwind {
+; ALL-LABEL: fptosi00:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
+; ALL-NEXT: retq
%b = fptosi <16 x float> %a to <16 x i32>
ret <16 x i32> %b
}
-; CHECK-LABEL: fptoui00
-; CHECK: vcvttps2udq
-; CHECK: ret
define <16 x i32> @fptoui00(<16 x float> %a) nounwind {
+; ALL-LABEL: fptoui00:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvttps2udq %zmm0, %zmm0
+; ALL-NEXT: retq
%b = fptoui <16 x float> %a to <16 x i32>
ret <16 x i32> %b
}
-; CHECK-LABEL: fptoui_256
-; CHECK: vcvttps2udq
-; CHECK: ret
define <8 x i32> @fptoui_256(<8 x float> %a) nounwind {
+; KNL-LABEL: fptoui_256:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: vcvttps2udq %zmm0, %zmm0
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fptoui_256:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvttps2udq %ymm0, %ymm0
+; SKX-NEXT: retq
%b = fptoui <8 x float> %a to <8 x i32>
ret <8 x i32> %b
}
-; CHECK-LABEL: fptoui_128
-; CHECK: vcvttps2udq
-; CHECK: ret
define <4 x i32> @fptoui_128(<4 x float> %a) nounwind {
+; KNL-LABEL: fptoui_128:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL-NEXT: vcvttps2udq %zmm0, %zmm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fptoui_128:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvttps2udq %xmm0, %xmm0
+; SKX-NEXT: retq
%b = fptoui <4 x float> %a to <4 x i32>
ret <4 x i32> %b
}
-; CHECK-LABEL: fptoui01
-; CHECK: vcvttpd2udq
-; CHECK: ret
define <8 x i32> @fptoui01(<8 x double> %a) nounwind {
+; ALL-LABEL: fptoui01:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvttpd2udq %zmm0, %ymm0
+; ALL-NEXT: retq
%b = fptoui <8 x double> %a to <8 x i32>
ret <8 x i32> %b
}
-; CHECK-LABEL: sitof64
-; CHECK: vcvtdq2pd %ymm
-; CHECK: ret
+define <4 x i32> @fptoui_256d(<4 x double> %a) nounwind {
+; KNL-LABEL: fptoui_256d:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: vcvttpd2udq %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fptoui_256d:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvttpd2udq %ymm0, %xmm0
+; SKX-NEXT: retq
+ %b = fptoui <4 x double> %a to <4 x i32>
+ ret <4 x i32> %b
+}
+
define <8 x double> @sitof64(<8 x i32> %a) {
+; ALL-LABEL: sitof64:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
+; ALL-NEXT: retq
%b = sitofp <8 x i32> %a to <8 x double>
ret <8 x double> %b
}
-; CHECK-LABEL: fptosi01
-; CHECK: vcvttpd2dq %zmm
-; CHECK: ret
define <8 x i32> @fptosi01(<8 x double> %a) {
+; ALL-LABEL: fptosi01:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; ALL-NEXT: retq
%b = fptosi <8 x double> %a to <8 x i32>
ret <8 x i32> %b
}
-; CHECK-LABEL: fptosi03
-; CHECK: vcvttpd2dq %ymm
-; CHECK: ret
define <4 x i32> @fptosi03(<4 x double> %a) {
+; KNL-LABEL: fptosi03:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvttpd2dqy %ymm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fptosi03:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvttpd2dq %ymm0, %xmm0
+; SKX-NEXT: retq
%b = fptosi <4 x double> %a to <4 x i32>
ret <4 x i32> %b
}
-; CHECK-LABEL: fptrunc00
-; CHECK: vcvtpd2ps %zmm
-; CHECK-NEXT: vcvtpd2ps %zmm
-; CHECK-NEXT: vinsertf
-; CHECK: ret
define <16 x float> @fptrunc00(<16 x double> %b) nounwind {
+; KNL-LABEL: fptrunc00:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtpd2ps %zmm0, %ymm0
+; KNL-NEXT: vcvtpd2ps %zmm1, %ymm1
+; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fptrunc00:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtpd2ps %zmm0, %ymm0
+; SKX-NEXT: vcvtpd2ps %zmm1, %ymm1
+; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0
+; SKX-NEXT: retq
%a = fptrunc <16 x double> %b to <16 x float>
ret <16 x float> %a
}
-; CHECK-LABEL: fptrunc01
-; CHECK: vcvtpd2ps %ymm
define <4 x float> @fptrunc01(<4 x double> %b) {
+; KNL-LABEL: fptrunc01:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtpd2psy %ymm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fptrunc01:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtpd2ps %ymm0, %xmm0
+; SKX-NEXT: retq
%a = fptrunc <4 x double> %b to <4 x float>
ret <4 x float> %a
}
-; CHECK-LABEL: fptrunc02
-; CHECK: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
define <4 x float> @fptrunc02(<4 x double> %b, <4 x i1> %mask) {
+; KNL-LABEL: fptrunc02:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL-NEXT: vcvtpd2psy %ymm0, %xmm0
+; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; KNL-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fptrunc02:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm1, %xmm1
+; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
+; SKX-NEXT: retq
%a = fptrunc <4 x double> %b to <4 x float>
%c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer
ret <4 x float> %c
}
-; CHECK-LABEL: fpext00
-; CHECK: vcvtps2pd %ymm0, %zmm0
-; CHECK: ret
define <8 x double> @fpext00(<8 x float> %b) nounwind {
+; ALL-LABEL: fpext00:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvtps2pd %ymm0, %zmm0
+; ALL-NEXT: retq
%a = fpext <8 x float> %b to <8 x double>
ret <8 x double> %a
}
-; CHECK-LABEL: fpext01
-; CHECK: vcvtps2pd %xmm0, %ymm0 {%k1} {z}
-; CHECK: ret
define <4 x double> @fpext01(<4 x float> %b, <4 x double>%b1, <4 x double>%a1) {
+; KNL-LABEL: fpext01:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtps2pd %xmm0, %ymm0
+; KNL-NEXT: vcmpltpd %ymm2, %ymm1, %ymm1
+; KNL-NEXT: vxorpd %ymm2, %ymm2, %ymm2
+; KNL-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fpext01:
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpltpd %ymm2, %ymm1, %k1
+; SKX-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z}
+; SKX-NEXT: retq
%a = fpext <4 x float> %b to <4 x double>
%mask = fcmp ogt <4 x double>%a1, %b1
%c = select <4 x i1>%mask, <4 x double>%a, <4 x double>zeroinitializer
ret <4 x double> %c
}
-; CHECK-LABEL: funcA
-; CHECK: vcvtsi2sdq (%rdi){{.*}} encoding: [0x62
-; CHECK: ret
define double @funcA(i64* nocapture %e) {
+; ALL-LABEL: funcA:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0
+; ALL-NEXT: retq
entry:
%tmp1 = load i64, i64* %e, align 8
%conv = sitofp i64 %tmp1 to double
ret double %conv
}
-; CHECK-LABEL: funcB
-; CHECK: vcvtsi2sdl (%{{.*}} encoding: [0x62
-; CHECK: ret
define double @funcB(i32* %e) {
+; ALL-LABEL: funcB:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0
+; ALL-NEXT: retq
entry:
%tmp1 = load i32, i32* %e, align 4
%conv = sitofp i32 %tmp1 to double
ret double %conv
}
-; CHECK-LABEL: funcC
-; CHECK: vcvtsi2ssl (%{{.*}} encoding: [0x62
-; CHECK: ret
define float @funcC(i32* %e) {
+; ALL-LABEL: funcC:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0
+; ALL-NEXT: retq
entry:
%tmp1 = load i32, i32* %e, align 4
%conv = sitofp i32 %tmp1 to float
ret float %conv
}
-; CHECK-LABEL: i64tof32
-; CHECK: vcvtsi2ssq (%{{.*}} encoding: [0x62
-; CHECK: ret
define float @i64tof32(i64* %e) {
+; ALL-LABEL: i64tof32:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0
+; ALL-NEXT: retq
entry:
%tmp1 = load i64, i64* %e, align 8
%conv = sitofp i64 %tmp1 to float
ret float %conv
}
-; CHECK-LABEL: fpext
-; CHECK: vcvtss2sd {{.*}} encoding: [0x62
-; CHECK: ret
define void @fpext() {
+; ALL-LABEL: fpext:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp)
+; ALL-NEXT: retq
entry:
%f = alloca float, align 4
%d = alloca double, align 8
@@ -232,12 +515,13 @@ entry:
ret void
}
-; CHECK-LABEL: fpround_scalar
-; CHECK: vmovsd {{.*}} encoding: [0x62
-; CHECK: vcvtsd2ss {{.*}} encoding: [0x62
-; CHECK: vmovss {{.*}} encoding: [0x62
-; CHECK: ret
define void @fpround_scalar() nounwind uwtable {
+; ALL-LABEL: fpround_scalar:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp)
+; ALL-NEXT: retq
entry:
%f = alloca float, align 4
%d = alloca double, align 8
@@ -247,179 +531,258 @@ entry:
ret void
}
-; CHECK-LABEL: long_to_double
-; CHECK: vmovq {{.*}} encoding: [0x62
-; CHECK: ret
define double @long_to_double(i64 %x) {
+; ALL-LABEL: long_to_double:
+; ALL: ## BB#0:
+; ALL-NEXT: vmovq %rdi, %xmm0
+; ALL-NEXT: retq
%res = bitcast i64 %x to double
ret double %res
}
-; CHECK-LABEL: double_to_long
-; CHECK: vmovq {{.*}} encoding: [0x62
-; CHECK: ret
define i64 @double_to_long(double %x) {
+; ALL-LABEL: double_to_long:
+; ALL: ## BB#0:
+; ALL-NEXT: vmovq %xmm0, %rax
+; ALL-NEXT: retq
%res = bitcast double %x to i64
ret i64 %res
}
-; CHECK-LABEL: int_to_float
-; CHECK: vmovd {{.*}} encoding: [0x62
-; CHECK: ret
define float @int_to_float(i32 %x) {
+; ALL-LABEL: int_to_float:
+; ALL: ## BB#0:
+; ALL-NEXT: vmovd %edi, %xmm0
+; ALL-NEXT: retq
%res = bitcast i32 %x to float
ret float %res
}
-; CHECK-LABEL: float_to_int
-; CHECK: vmovd {{.*}} encoding: [0x62
-; CHECK: ret
define i32 @float_to_int(float %x) {
+; ALL-LABEL: float_to_int:
+; ALL: ## BB#0:
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: retq
%res = bitcast float %x to i32
ret i32 %res
}
define <16 x double> @uitof64(<16 x i32> %a) nounwind {
-; CHECK-LABEL: uitof64:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm2
-; CHECK-NEXT: vextracti32x8 $1, %zmm0, %ymm0
-; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm1
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
-; CHECK-NEXT: retq
+; KNL-LABEL: uitof64:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtudq2pd %ymm0, %zmm2
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vcvtudq2pd %ymm0, %zmm1
+; KNL-NEXT: vmovaps %zmm2, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: uitof64:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtudq2pd %ymm0, %zmm2
+; SKX-NEXT: vextracti32x8 $1, %zmm0, %ymm0
+; SKX-NEXT: vcvtudq2pd %ymm0, %zmm1
+; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: retq
%b = uitofp <16 x i32> %a to <16 x double>
ret <16 x double> %b
}
-; CHECK-LABEL: uitof64_256
-; CHECK: vcvtudq2pd
-; CHECK: ret
define <4 x double> @uitof64_256(<4 x i32> %a) nounwind {
+; KNL-LABEL: uitof64_256:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: uitof64_256:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtudq2pd %xmm0, %ymm0
+; SKX-NEXT: retq
%b = uitofp <4 x i32> %a to <4 x double>
ret <4 x double> %b
}
-; CHECK-LABEL: uitof32
-; CHECK: vcvtudq2ps
-; CHECK: ret
define <16 x float> @uitof32(<16 x i32> %a) nounwind {
+; ALL-LABEL: uitof32:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; ALL-NEXT: retq
%b = uitofp <16 x i32> %a to <16 x float>
ret <16 x float> %b
}
-; CHECK-LABEL: uitof32_256
-; CHECK: vcvtudq2ps
-; CHECK: ret
define <8 x float> @uitof32_256(<8 x i32> %a) nounwind {
+; KNL-LABEL: uitof32_256:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: uitof32_256:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtudq2ps %ymm0, %ymm0
+; SKX-NEXT: retq
%b = uitofp <8 x i32> %a to <8 x float>
ret <8 x float> %b
}
-; CHECK-LABEL: uitof32_128
-; CHECK: vcvtudq2ps
-; CHECK: ret
define <4 x float> @uitof32_128(<4 x i32> %a) nounwind {
+; KNL-LABEL: uitof32_128:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: uitof32_128:
+; SKX: ## BB#0:
+; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0
+; SKX-NEXT: retq
%b = uitofp <4 x i32> %a to <4 x float>
ret <4 x float> %b
}
-; CHECK-LABEL: @fptosi02
-; CHECK: vcvttss2si {{.*}} encoding: [0x62
-; CHECK: ret
define i32 @fptosi02(float %a) nounwind {
+; ALL-LABEL: fptosi02:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvttss2si %xmm0, %eax
+; ALL-NEXT: retq
%b = fptosi float %a to i32
ret i32 %b
}
-; CHECK-LABEL: @fptoui02
-; CHECK: vcvttss2usi {{.*}} encoding: [0x62
-; CHECK: ret
define i32 @fptoui02(float %a) nounwind {
+; ALL-LABEL: fptoui02:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvttss2usi %xmm0, %eax
+; ALL-NEXT: retq
%b = fptoui float %a to i32
ret i32 %b
}
-; CHECK-LABEL: @uitofp02
-; CHECK: vcvtusi2ss
-; CHECK: ret
define float @uitofp02(i32 %a) nounwind {
+; ALL-LABEL: uitofp02:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0
+; ALL-NEXT: retq
%b = uitofp i32 %a to float
ret float %b
}
-; CHECK-LABEL: @uitofp03
-; CHECK: vcvtusi2sd
-; CHECK: ret
define double @uitofp03(i32 %a) nounwind {
+; ALL-LABEL: uitofp03:
+; ALL: ## BB#0:
+; ALL-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0
+; ALL-NEXT: retq
%b = uitofp i32 %a to double
ret double %b
}
-; CHECK-LABEL: @sitofp_16i1_float
-; CHECK: vpmovm2d
-; CHECK: vcvtdq2ps
define <16 x float> @sitofp_16i1_float(<16 x i32> %a) {
+; KNL-LABEL: sitofp_16i1_float:
+; KNL: ## BB#0:
+; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vcvtdq2ps %zmm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sitofp_16i1_float:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
+; SKX-NEXT: vpmovm2d %k0, %zmm0
+; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0
+; SKX-NEXT: retq
%mask = icmp slt <16 x i32> %a, zeroinitializer
%1 = sitofp <16 x i1> %mask to <16 x float>
ret <16 x float> %1
}
-; CHECK-LABEL: @sitofp_16i8_float
-; CHECK: vpmovsxbd
-; CHECK: vcvtdq2ps
define <16 x float> @sitofp_16i8_float(<16 x i8> %a) {
+; ALL-LABEL: sitofp_16i8_float:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxbd %xmm0, %zmm0
+; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
+; ALL-NEXT: retq
%1 = sitofp <16 x i8> %a to <16 x float>
ret <16 x float> %1
}
-; CHECK-LABEL: @sitofp_16i16_float
-; CHECK: vpmovsxwd
-; CHECK: vcvtdq2ps
define <16 x float> @sitofp_16i16_float(<16 x i16> %a) {
+; ALL-LABEL: sitofp_16i16_float:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxwd %ymm0, %zmm0
+; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
+; ALL-NEXT: retq
%1 = sitofp <16 x i16> %a to <16 x float>
ret <16 x float> %1
}
-; CHECK-LABEL: @sitofp_8i16_double
-; CHECK: vpmovsxwd
-; CHECK: vcvtdq2pd
define <8 x double> @sitofp_8i16_double(<8 x i16> %a) {
+; ALL-LABEL: sitofp_8i16_double:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovsxwd %xmm0, %ymm0
+; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
+; ALL-NEXT: retq
%1 = sitofp <8 x i16> %a to <8 x double>
ret <8 x double> %1
}
-; CHECK-LABEL: sitofp_8i8_double
-; CHECK: vpmovzxwd
-; CHECK: vpslld
-; CHECK: vpsrad
-; CHECK: vcvtdq2pd
define <8 x double> @sitofp_8i8_double(<8 x i8> %a) {
+; ALL-LABEL: sitofp_8i8_double:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; ALL-NEXT: vpslld $24, %ymm0, %ymm0
+; ALL-NEXT: vpsrad $24, %ymm0, %ymm0
+; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
+; ALL-NEXT: retq
%1 = sitofp <8 x i8> %a to <8 x double>
ret <8 x double> %1
}
-
-; CHECK-LABEL: @sitofp_8i1_double
-; CHECK: vpmovm2d
-; CHECK: vcvtdq2pd
define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
+; KNL-LABEL: sitofp_8i1_double:
+; KNL: ## BB#0:
+; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovqd %zmm0, %ymm0
+; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: sitofp_8i1_double:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; SKX-NEXT: vcmpltpd %zmm0, %zmm1, %k0
+; SKX-NEXT: vpmovm2d %k0, %ymm0
+; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0
+; SKX-NEXT: retq
%cmpres = fcmp ogt <8 x double> %a, zeroinitializer
%1 = sitofp <8 x i1> %cmpres to <8 x double>
ret <8 x double> %1
}
-; CHECK-LABEL: @uitofp_16i8
-; CHECK: vpmovzxbd
-; CHECK: vcvtudq2ps
define <16 x float> @uitofp_16i8(<16 x i8>%a) {
+; ALL-LABEL: uitofp_16i8:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; ALL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; ALL-NEXT: retq
%b = uitofp <16 x i8> %a to <16 x float>
ret <16 x float>%b
}
-; CHECK-LABEL: @uitofp_16i16
-; CHECK: vpmovzxwd
-; CHECK: vcvtudq2ps
define <16 x float> @uitofp_16i16(<16 x i16>%a) {
+; ALL-LABEL: uitofp_16i16:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; ALL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; ALL-NEXT: retq
%b = uitofp <16 x i16> %a to <16 x float>
ret <16 x float>%b
}
diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll
index bc1509684475..faac7b20fd61 100644
--- a/test/CodeGen/X86/avx512-ext.ll
+++ b/test/CodeGen/X86/avx512-ext.ll
@@ -15,7 +15,7 @@ define <8 x i16> @zext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind re
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
-; SKX-NEXT: vpmovzxbw (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxbw {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SKX-NEXT: retq
%a = load <8 x i8>,<8 x i8> *%i,align 1
%x = zext <8 x i8> %a to <8 x i16>
@@ -59,7 +59,7 @@ define <16 x i16> @zext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwi
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
-; SKX-NEXT: vpmovzxbw (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; SKX-NEXT: retq
%a = load <16 x i8>,<16 x i8> *%i,align 1
%x = zext <16 x i8> %a to <16 x i16>
@@ -90,15 +90,10 @@ define <16 x i16> @sext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwi
}
define <16 x i16> @zext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone {
-; KNL-LABEL: zext_16x8_to_16x16:
-; KNL: ## BB#0:
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL-NEXT: retq
-;
-; SKX-LABEL: zext_16x8_to_16x16:
-; SKX: ## BB#0:
-; SKX-NEXT: vpmovzxbw %xmm0, %ymm0
-; SKX-NEXT: retq
+; ALL-LABEL: zext_16x8_to_16x16:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; ALL-NEXT: retq
%x = zext <16 x i8> %a to <16 x i16>
ret <16 x i16> %x
}
@@ -117,7 +112,7 @@ define <16 x i16> @zext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwi
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
; SKX-NEXT: vpmovb2m %xmm1, %k1
-; SKX-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; SKX-NEXT: retq
%x = zext <16 x i8> %a to <16 x i16>
%ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
@@ -175,7 +170,7 @@ define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
; SKX-NEXT: vpmovb2m %ymm0, %k1
-; SKX-NEXT: vpmovzxbw (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero
; SKX-NEXT: retq
%a = load <32 x i8>,<32 x i8> *%i,align 1
%x = zext <32 x i8> %a to <32 x i16>
@@ -223,7 +218,7 @@ define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
;
; SKX-LABEL: zext_32x8_to_32x16:
; SKX: ## BB#0:
-; SKX-NEXT: vpmovzxbw %ymm0, %zmm0
+; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; SKX-NEXT: retq
%x = zext <32 x i8> %a to <32 x i16>
ret <32 x i16> %x
@@ -250,7 +245,7 @@ define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwi
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %ymm1, %ymm1
; SKX-NEXT: vpmovb2m %ymm1, %k1
-; SKX-NEXT: vpmovzxbw %ymm0, %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; SKX-NEXT: retq
%x = zext <32 x i8> %a to <32 x i16>
%ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
@@ -314,8 +309,8 @@ define <4 x i32> @zext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind re
; SKX-LABEL: zext_4x8mem_to_4x32:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
-; SKX-NEXT: vpmovzxbd (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SKX-NEXT: retq
%a = load <4 x i8>,<4 x i8> *%i,align 1
%x = zext <4 x i8> %a to <4 x i32>
@@ -335,7 +330,7 @@ define <4 x i32> @sext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind re
; SKX-LABEL: sext_4x8mem_to_4x32:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <4 x i8>,<4 x i8> *%i,align 1
@@ -353,13 +348,14 @@ define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re
; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x8mem_to_8x32:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
-; SKX-NEXT: vpmovzxbd (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; SKX-NEXT: retq
%a = load <8 x i8>,<8 x i8> *%i,align 1
%x = zext <8 x i8> %a to <8 x i32>
@@ -376,6 +372,7 @@ define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re
; KNL-NEXT: vpmovsxbd (%rdi), %ymm0
; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: sext_8x8mem_to_8x32:
@@ -396,14 +393,14 @@ define <16 x i32> @zext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwi
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovzxbd (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; KNL-NEXT: retq
;
; SKX-LABEL: zext_16x8mem_to_16x32:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
-; SKX-NEXT: vpmovzxbd (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; SKX-NEXT: retq
%a = load <16 x i8>,<16 x i8> *%i,align 1
%x = zext <16 x i8> %a to <16 x i32>
@@ -438,14 +435,14 @@ define <16 x i32> @zext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounw
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
-; KNL-NEXT: vpmovzxbd %xmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL-NEXT: retq
;
; SKX-LABEL: zext_16x8_to_16x32_mask:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
; SKX-NEXT: vpmovb2m %xmm1, %k1
-; SKX-NEXT: vpmovzxbd %xmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; SKX-NEXT: retq
%x = zext <16 x i8> %a to <16 x i32>
%ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
@@ -475,7 +472,7 @@ define <16 x i32> @sext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounw
define <16 x i32> @zext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
; ALL-LABEL: zext_16x8_to_16x32:
; ALL: ## BB#0:
-; ALL-NEXT: vpmovzxbd %xmm0, %zmm0
+; ALL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; ALL-NEXT: retq
%x = zext <16 x i8> %i to <16 x i32>
ret <16 x i32> %x
@@ -503,8 +500,8 @@ define <2 x i64> @zext_2x8mem_to_2x64(<2 x i8> *%i , <2 x i1> %mask) nounwind re
; SKX-LABEL: zext_2x8mem_to_2x64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vpmovq2m %xmm0, %k1
-; SKX-NEXT: vpmovzxbq (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; SKX-NEXT: retq
%a = load <2 x i8>,<2 x i8> *%i,align 1
%x = zext <2 x i8> %a to <2 x i64>
@@ -524,7 +521,7 @@ define <2 x i64> @sext_2x8mem_to_2x64mask(<2 x i8> *%i , <2 x i1> %mask) nounwin
; SKX-LABEL: sext_2x8mem_to_2x64mask:
; SKX: ## BB#0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vpmovq2m %xmm0, %k1
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <2 x i8>,<2 x i8> *%i,align 1
@@ -555,8 +552,8 @@ define <4 x i64> @zext_4x8mem_to_4x64(<4 x i8> *%i , <4 x i1> %mask) nounwind re
; SKX-LABEL: zext_4x8mem_to_4x64:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
-; SKX-NEXT: vpmovzxbq (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
; SKX-NEXT: retq
%a = load <4 x i8>,<4 x i8> *%i,align 1
%x = zext <4 x i8> %a to <4 x i64>
@@ -577,7 +574,7 @@ define <4 x i64> @sext_4x8mem_to_4x64mask(<4 x i8> *%i , <4 x i1> %mask) nounwin
; SKX-LABEL: sext_4x8mem_to_4x64mask:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <4 x i8>,<4 x i8> *%i,align 1
@@ -602,14 +599,14 @@ define <8 x i64> @zext_8x8mem_to_8x64(<8 x i8> *%i , <8 x i1> %mask) nounwind re
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovzxbq (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x8mem_to_8x64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
-; SKX-NEXT: vpmovzxbq (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
; SKX-NEXT: retq
%a = load <8 x i8>,<8 x i8> *%i,align 1
%x = zext <8 x i8> %a to <8 x i64>
@@ -660,8 +657,8 @@ define <4 x i32> @zext_4x16mem_to_4x32(<4 x i16> *%i , <4 x i1> %mask) nounwind
; SKX-LABEL: zext_4x16mem_to_4x32:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
-; SKX-NEXT: vpmovzxwd (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SKX-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
%x = zext <4 x i16> %a to <4 x i32>
@@ -681,7 +678,7 @@ define <4 x i32> @sext_4x16mem_to_4x32mask(<4 x i16> *%i , <4 x i1> %mask) nounw
; SKX-LABEL: sext_4x16mem_to_4x32mask:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
@@ -710,13 +707,14 @@ define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind
; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x16mem_to_8x32:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
-; SKX-NEXT: vpmovzxwd (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SKX-NEXT: retq
%a = load <8 x i16>,<8 x i16> *%i,align 1
%x = zext <8 x i16> %a to <8 x i32>
@@ -733,6 +731,7 @@ define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounw
; KNL-NEXT: vpmovsxwd (%rdi), %ymm0
; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: sext_8x16mem_to_8x32mask:
@@ -766,13 +765,14 @@ define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind
; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x16_to_8x32mask:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
-; SKX-NEXT: vpmovzxwd %xmm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SKX-NEXT: retq
%x = zext <8 x i16> %a to <8 x i32>
%ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
@@ -780,15 +780,10 @@ define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind
}
define <8 x i32> @zext_8x16_to_8x32(<8 x i16> %a ) nounwind readnone {
-; KNL-LABEL: zext_8x16_to_8x32:
-; KNL: ## BB#0:
-; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; KNL-NEXT: retq
-;
-; SKX-LABEL: zext_8x16_to_8x32:
-; SKX: ## BB#0:
-; SKX-NEXT: vpmovzxwd %xmm0, %ymm0
-; SKX-NEXT: retq
+; ALL-LABEL: zext_8x16_to_8x32:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; ALL-NEXT: retq
%x = zext <8 x i16> %a to <8 x i32>
ret <8 x i32> %x
}
@@ -799,14 +794,14 @@ define <16 x i32> @zext_16x16mem_to_16x32(<16 x i16> *%i , <16 x i1> %mask) noun
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovzxwd (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; KNL-NEXT: retq
;
; SKX-LABEL: zext_16x16mem_to_16x32:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
-; SKX-NEXT: vpmovzxwd (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; SKX-NEXT: retq
%a = load <16 x i16>,<16 x i16> *%i,align 1
%x = zext <16 x i16> %a to <16 x i32>
@@ -850,14 +845,14 @@ define <16 x i32> @zext_16x16_to_16x32mask(<16 x i16> %a , <16 x i1> %mask) noun
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
-; KNL-NEXT: vpmovzxwd %ymm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; KNL-NEXT: retq
;
; SKX-LABEL: zext_16x16_to_16x32mask:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
; SKX-NEXT: vpmovb2m %xmm1, %k1
-; SKX-NEXT: vpmovzxwd %ymm0, %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; SKX-NEXT: retq
%x = zext <16 x i16> %a to <16 x i32>
%ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
@@ -867,7 +862,7 @@ define <16 x i32> @zext_16x16_to_16x32mask(<16 x i16> %a , <16 x i1> %mask) noun
define <16 x i32> @zext_16x16_to_16x32(<16 x i16> %a ) nounwind readnone {
; ALL-LABEL: zext_16x16_to_16x32:
; ALL: ## BB#0:
-; ALL-NEXT: vpmovzxwd %ymm0, %zmm0
+; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; ALL-NEXT: retq
%x = zext <16 x i16> %a to <16 x i32>
ret <16 x i32> %x
@@ -886,8 +881,8 @@ define <2 x i64> @zext_2x16mem_to_2x64(<2 x i16> *%i , <2 x i1> %mask) nounwind
; SKX-LABEL: zext_2x16mem_to_2x64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vpmovq2m %xmm0, %k1
-; SKX-NEXT: vpmovzxwq (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; SKX-NEXT: retq
%a = load <2 x i16>,<2 x i16> *%i,align 1
%x = zext <2 x i16> %a to <2 x i64>
@@ -908,7 +903,7 @@ define <2 x i64> @sext_2x16mem_to_2x64mask(<2 x i16> *%i , <2 x i1> %mask) nounw
; SKX-LABEL: sext_2x16mem_to_2x64mask:
; SKX: ## BB#0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vpmovq2m %xmm0, %k1
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <2 x i16>,<2 x i16> *%i,align 1
@@ -940,8 +935,8 @@ define <4 x i64> @zext_4x16mem_to_4x64(<4 x i16> *%i , <4 x i1> %mask) nounwind
; SKX-LABEL: zext_4x16mem_to_4x64:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
-; SKX-NEXT: vpmovzxwq (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SKX-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
%x = zext <4 x i16> %a to <4 x i64>
@@ -962,7 +957,7 @@ define <4 x i64> @sext_4x16mem_to_4x64mask(<4 x i16> *%i , <4 x i1> %mask) nounw
; SKX-LABEL: sext_4x16mem_to_4x64mask:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
@@ -987,14 +982,14 @@ define <8 x i64> @zext_8x16mem_to_8x64(<8 x i16> *%i , <8 x i1> %mask) nounwind
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovzxwq (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x16mem_to_8x64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
-; SKX-NEXT: vpmovzxwq (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; SKX-NEXT: retq
%a = load <8 x i16>,<8 x i16> *%i,align 1
%x = zext <8 x i16> %a to <8 x i64>
@@ -1039,14 +1034,14 @@ define <8 x i64> @zext_8x16_to_8x64mask(<8 x i16> %a , <8 x i1> %mask) nounwind
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
-; KNL-NEXT: vpmovzxwq %xmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x16_to_8x64mask:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
-; SKX-NEXT: vpmovzxwq %xmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; SKX-NEXT: retq
%x = zext <8 x i16> %a to <8 x i64>
%ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
@@ -1056,7 +1051,7 @@ define <8 x i64> @zext_8x16_to_8x64mask(<8 x i16> %a , <8 x i1> %mask) nounwind
define <8 x i64> @zext_8x16_to_8x64(<8 x i16> %a) nounwind readnone {
; ALL-LABEL: zext_8x16_to_8x64:
; ALL: ## BB#0:
-; ALL-NEXT: vpmovzxwq %xmm0, %zmm0
+; ALL-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; ALL-NEXT: retq
%ret = zext <8 x i16> %a to <8 x i64>
ret <8 x i64> %ret
@@ -1075,8 +1070,8 @@ define <2 x i64> @zext_2x32mem_to_2x64(<2 x i32> *%i , <2 x i1> %mask) nounwind
; SKX-LABEL: zext_2x32mem_to_2x64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vpmovq2m %xmm0, %k1
-; SKX-NEXT: vpmovzxdq (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero
; SKX-NEXT: retq
%a = load <2 x i32>,<2 x i32> *%i,align 1
%x = zext <2 x i32> %a to <2 x i64>
@@ -1097,7 +1092,7 @@ define <2 x i64> @sext_2x32mem_to_2x64mask(<2 x i32> *%i , <2 x i1> %mask) nounw
; SKX-LABEL: sext_2x32mem_to_2x64mask:
; SKX: ## BB#0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vpmovq2m %xmm0, %k1
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <2 x i32>,<2 x i32> *%i,align 1
@@ -1129,8 +1124,8 @@ define <4 x i64> @zext_4x32mem_to_4x64(<4 x i32> *%i , <4 x i1> %mask) nounwind
; SKX-LABEL: zext_4x32mem_to_4x64:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
-; SKX-NEXT: vpmovzxdq (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SKX-NEXT: retq
%a = load <4 x i32>,<4 x i32> *%i,align 1
%x = zext <4 x i32> %a to <4 x i64>
@@ -1151,7 +1146,7 @@ define <4 x i64> @sext_4x32mem_to_4x64mask(<4 x i32> *%i , <4 x i1> %mask) nounw
; SKX-LABEL: sext_4x32mem_to_4x64mask:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z}
; SKX-NEXT: retq
%a = load <4 x i32>,<4 x i32> *%i,align 1
@@ -1192,8 +1187,8 @@ define <4 x i64> @zext_4x32_to_4x64mask(<4 x i32> %a , <4 x i1> %mask) nounwind
; SKX-LABEL: zext_4x32_to_4x64mask:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX-NEXT: vpmovd2m %xmm1, %k1
-; SKX-NEXT: vpmovzxdq %xmm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
+; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SKX-NEXT: retq
%x = zext <4 x i32> %a to <4 x i64>
%ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer
@@ -1206,14 +1201,14 @@ define <8 x i64> @zext_8x32mem_to_8x64(<8 x i32> *%i , <8 x i1> %mask) nounwind
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovzxdq (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x32mem_to_8x64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
-; SKX-NEXT: vpmovzxdq (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SKX-NEXT: retq
%a = load <8 x i32>,<8 x i32> *%i,align 1
%x = zext <8 x i32> %a to <8 x i64>
@@ -1267,14 +1262,14 @@ define <8 x i64> @zext_8x32_to_8x64mask(<8 x i32> %a , <8 x i1> %mask) nounwind
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
-; KNL-NEXT: vpmovzxdq %ymm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x32_to_8x64mask:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
-; SKX-NEXT: vpmovzxdq %ymm0, %zmm0 {%k1} {z}
+; SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; SKX-NEXT: retq
%x = zext <8 x i32> %a to <8 x i64>
%ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
@@ -1312,8 +1307,7 @@ define <16 x i32> @zext_16i1_to_16xi32(i16 %b) {
define <8 x i64> @zext_8i1_to_8xi64(i8 %b) {
; KNL-LABEL: zext_8i1_to_8xi64:
; KNL: ## BB#0:
-; KNL-NEXT: movzbl %dil, %eax
-; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
@@ -1334,6 +1328,7 @@ define i16 @trunc_16i8_to_16i1(<16 x i8> %a) {
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_16i8_to_16i1:
@@ -1341,6 +1336,7 @@ define i16 @trunc_16i8_to_16i1(<16 x i8> %a) {
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k0
; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SKX-NEXT: retq
%mask_b = trunc <16 x i8>%a to <16 x i1>
%mask = bitcast <16 x i1> %mask_b to i16
@@ -1348,19 +1344,13 @@ define i16 @trunc_16i8_to_16i1(<16 x i8> %a) {
}
define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {
-; KNL-LABEL: trunc_16i32_to_16i1:
-; KNL: ## BB#0:
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_16i32_to_16i1:
-; SKX: ## BB#0:
-; SKX-NEXT: vpslld $31, %zmm0, %zmm0
-; SKX-NEXT: vpmovd2m %zmm0, %k0
-; SKX-NEXT: kmovw %k0, %eax
-; SKX-NEXT: retq
+; ALL-LABEL: trunc_16i32_to_16i1:
+; ALL: ## BB#0:
+; ALL-NEXT: vpslld $31, %zmm0, %zmm0
+; ALL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; ALL-NEXT: kmovw %k0, %eax
+; ALL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; ALL-NEXT: retq
%mask_b = trunc <16 x i32>%a to <16 x i1>
%mask = bitcast <16 x i1> %mask_b to i16
ret i16 %mask
@@ -1377,10 +1367,9 @@ define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) {
; SKX-LABEL: trunc_4i32_to_4i1:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k0
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpslld $31, %xmm1, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k1
-; SKX-NEXT: kandw %k1, %k0, %k0
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1}
; SKX-NEXT: vpmovm2d %k0, %xmm0
; SKX-NEXT: retq
%mask_a = trunc <4 x i32>%a to <4 x i1>
@@ -1398,6 +1387,7 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_8i16_to_8i1:
@@ -1405,6 +1395,7 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k0
; SKX-NEXT: kmovb %k0, %eax
+; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SKX-NEXT: retq
%mask_b = trunc <8 x i16>%a to <8 x i1>
%mask = bitcast <8 x i1> %mask_b to i8
@@ -1414,9 +1405,12 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
; KNL-LABEL: sext_8i1_8i32:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
; KNL-NEXT: knotw %k0, %k1
-; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: retq
;
@@ -1442,6 +1436,7 @@ define i16 @trunc_i32_to_i1(i32 %a) {
; ALL-NEXT: kmovw %eax, %k1
; ALL-NEXT: korw %k0, %k1, %k0
; ALL-NEXT: kmovw %k0, %eax
+; ALL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; ALL-NEXT: retq
%a_i = trunc i32 %a to i1
%maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %a_i, i32 0
@@ -1454,6 +1449,7 @@ define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind {
; KNL: ## BB#0:
; KNL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0
; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: sext_8i1_8i16:
@@ -1470,7 +1466,8 @@ define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind {
; KNL-LABEL: sext_16i1_16i32:
; KNL: ## BB#0:
; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: sext_16i1_16i32:
@@ -1532,265 +1529,264 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $15, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kmovw %k1, %r15d
; KNL-NEXT: kshiftlw $13, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kmovw %k1, %r12d
; KNL-NEXT: kshiftlw $12, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edi
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftlw $11, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %esi
+; KNL-NEXT: kmovw %k1, %r13d
; KNL-NEXT: kshiftlw $10, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r13d
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
; KNL-NEXT: kshiftlw $9, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r8d
+; KNL-NEXT: kmovw %k1, %esi
; KNL-NEXT: kshiftlw $8, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r10d
+; KNL-NEXT: kmovw %k1, %edi
; KNL-NEXT: kshiftlw $7, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r11d
+; KNL-NEXT: kmovw %k1, %r8d
; KNL-NEXT: kshiftlw $6, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ebx
+; KNL-NEXT: kmovw %k1, %r9d
; KNL-NEXT: kshiftlw $5, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ebp
+; KNL-NEXT: kmovw %k1, %r10d
; KNL-NEXT: kshiftlw $4, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r14d
+; KNL-NEXT: kmovw %k1, %r11d
; KNL-NEXT: kshiftlw $3, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r15d
+; KNL-NEXT: kmovw %k1, %ebx
; KNL-NEXT: kshiftlw $2, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: kmovw %k1, %ebp
; KNL-NEXT: kshiftlw $1, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r12d
-; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1
+; KNL-NEXT: kmovw %k1, %r14d
+; KNL-NEXT: vptestmd %zmm5, %zmm5, %k2
; KNL-NEXT: kshiftlw $0, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vmovd %eax, %xmm4
+; KNL-NEXT: vmovd %r15d, %xmm4
+; KNL-NEXT: kmovw %k0, %r15d
+; KNL-NEXT: kshiftlw $14, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $1, %ecx, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kshiftlw $15, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $2, %r12d, %xmm4, %xmm4
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: kshiftlw $14, %k1, %k0
+; KNL-NEXT: kshiftlw $13, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $1, %edx, %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $3, %edx, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r12d
+; KNL-NEXT: kshiftlw $12, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $4, %r13d, %xmm4, %xmm4
; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
-; KNL-NEXT: kshiftlw $15, %k1, %k0
+; KNL-NEXT: kshiftlw $11, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; KNL-NEXT: kmovw %k0, %r13d
+; KNL-NEXT: kshiftlw $10, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $6, %esi, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; KNL-NEXT: kshiftlw $9, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $7, %edi, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: kshiftlw $8, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $8, %r8d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %edi
+; KNL-NEXT: kshiftlw $7, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $9, %r9d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r8d
+; KNL-NEXT: kshiftlw $6, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $10, %r10d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r9d
+; KNL-NEXT: kshiftlw $5, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $11, %r11d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: kshiftlw $4, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $12, %ebx, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %ebx
+; KNL-NEXT: kshiftlw $3, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $13, %ebp, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %ebp
+; KNL-NEXT: kshiftlw $2, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $2, %ecx, %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $14, %r14d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r11d
+; KNL-NEXT: kshiftlw $1, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $15, %r15d, %xmm4, %xmm4
+; KNL-NEXT: kmovw %k0, %r14d
+; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1
+; KNL-NEXT: kshiftlw $0, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vmovd %eax, %xmm5
+; KNL-NEXT: kmovw %k0, %r15d
+; KNL-NEXT: kshiftlw $14, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $1, %ecx, %xmm5, %xmm5
; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kshiftlw $15, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $2, %r12d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $13, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $3, %edi, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %edi
+; KNL-NEXT: vpinsrb $3, %edx, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %r12d
; KNL-NEXT: kshiftlw $12, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $4, %esi, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: vpinsrb $4, %r13d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: kshiftlw $11, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $5, %r13d, %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
; KNL-NEXT: kmovw %k0, %r13d
; KNL-NEXT: kshiftlw $10, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $6, %r8d, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %r8d
+; KNL-NEXT: vpinsrb $6, %esi, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
; KNL-NEXT: kshiftlw $9, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $7, %r10d, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: vpinsrb $7, %edi, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %esi
; KNL-NEXT: kshiftlw $8, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $8, %r11d, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %r11d
+; KNL-NEXT: vpinsrb $8, %r8d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %edi
; KNL-NEXT: kshiftlw $7, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $9, %ebx, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %ebx
+; KNL-NEXT: vpinsrb $9, %r9d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %r8d
; KNL-NEXT: kshiftlw $6, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $10, %ebp, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %ebp
+; KNL-NEXT: vpinsrb $10, %r10d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %r9d
; KNL-NEXT: kshiftlw $5, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $11, %r14d, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %r14d
+; KNL-NEXT: vpinsrb $11, %ebx, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %ebx
; KNL-NEXT: kshiftlw $4, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $12, %r15d, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %r15d
+; KNL-NEXT: vpinsrb $12, %ebp, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %ebp
; KNL-NEXT: kshiftlw $3, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $13, %r9d, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; KNL-NEXT: vpinsrb $13, %r11d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %r10d
; KNL-NEXT: kshiftlw $2, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $14, %r12d, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %r12d
+; KNL-NEXT: vpinsrb $14, %r14d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %r11d
; KNL-NEXT: kshiftlw $1, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
-; KNL-NEXT: kmovw %k0, %r9d
-; KNL-NEXT: vptestmd %zmm6, %zmm6, %k0
+; KNL-NEXT: vpinsrb $15, %r15d, %xmm5, %xmm5
+; KNL-NEXT: kmovw %k0, %r14d
+; KNL-NEXT: vptestmd %zmm7, %zmm7, %k0
; KNL-NEXT: kshiftlw $0, %k1, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vmovd %ecx, %xmm5
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: vmovd %eax, %xmm6
+; KNL-NEXT: kmovw %k1, %r15d
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; KNL-NEXT: vpinsrb $1, %ecx, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $15, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $2, %edi, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %r12d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %r12d
; KNL-NEXT: kshiftlw $13, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $3, %esi, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %edi
+; KNL-NEXT: vpinsrb $3, %edx, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftlw $12, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $4, %r13d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vpinsrb $4, %r13d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %r13d
; KNL-NEXT: kshiftlw $11, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $5, %r8d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %r8d
+; KNL-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $10, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $6, %r10d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %r13d
+; KNL-NEXT: vpinsrb $6, %esi, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %esi
; KNL-NEXT: kshiftlw $9, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $7, %r11d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %esi
-; KNL-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; KNL-NEXT: vpinsrb $7, %edi, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %edi
; KNL-NEXT: kshiftlw $8, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $8, %ebx, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %ebx
+; KNL-NEXT: vpinsrb $8, %r8d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %r8d
; KNL-NEXT: kshiftlw $7, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $9, %ebp, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %ebp
+; KNL-NEXT: vpinsrb $9, %r9d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %r9d
; KNL-NEXT: kshiftlw $6, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $10, %r14d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %r10d
+; KNL-NEXT: vpinsrb $10, %ebx, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %ebx
; KNL-NEXT: kshiftlw $5, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $11, %r15d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %r11d
+; KNL-NEXT: vpinsrb $11, %ebp, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %ebp
; KNL-NEXT: kshiftlw $4, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
-; KNL-NEXT: kmovw %k1, %esi
+; KNL-NEXT: vpinsrb $12, %r10d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %r10d
; KNL-NEXT: kshiftlw $3, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $13, %r12d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %r14d
+; KNL-NEXT: vpinsrb $13, %r11d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %r11d
; KNL-NEXT: kshiftlw $2, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $14, %r9d, %xmm5, %xmm5
-; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: vpinsrb $14, %r14d, %xmm6, %xmm6
+; KNL-NEXT: kmovw %k1, %r14d
; KNL-NEXT: kshiftlw $1, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $15, %edx, %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $15, %r15d, %xmm6, %xmm6
; KNL-NEXT: kmovw %k1, %r15d
-; KNL-NEXT: vptestmd %zmm7, %zmm7, %k1
; KNL-NEXT: kshiftlw $0, %k0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vmovd %eax, %xmm6
+; KNL-NEXT: vmovd %r12d, %xmm7
; KNL-NEXT: kmovw %k0, %r12d
-; KNL-NEXT: kshiftlw $14, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: kshiftlw $15, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $2, %edi, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: kshiftlw $13, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $3, %ecx, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: kshiftlw $12, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $4, %r8d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %r8d
-; KNL-NEXT: kshiftlw $11, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $5, %r13d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %r13d
-; KNL-NEXT: kshiftlw $10, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
-; KNL-NEXT: kmovw %k0, %edi
-; KNL-NEXT: kshiftlw $9, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $7, %ebx, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %ebx
-; KNL-NEXT: kshiftlw $8, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $8, %ebp, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %ebp
-; KNL-NEXT: kshiftlw $7, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $9, %r10d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %r10d
-; KNL-NEXT: kshiftlw $6, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $10, %r11d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %r11d
-; KNL-NEXT: kshiftlw $5, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $11, %esi, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %esi
-; KNL-NEXT: kshiftlw $4, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $12, %r14d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %r14d
-; KNL-NEXT: kshiftlw $3, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $13, %r9d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %r9d
-; KNL-NEXT: kshiftlw $2, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $14, %r15d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %r15d
-; KNL-NEXT: kshiftlw $1, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $15, %r12d, %xmm6, %xmm6
-; KNL-NEXT: kmovw %k0, %r12d
-; KNL-NEXT: kshiftlw $0, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vmovd %edx, %xmm7
-; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: vpinsrb $1, %eax, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $2, %ecx, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $3, %r8d, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $4, %r13d, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $5, %edi, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $6, %ebx, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $7, %ebp, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $8, %r10d, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $9, %r11d, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $10, %esi, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $11, %r14d, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $12, %r9d, %xmm7, %xmm7
-; KNL-NEXT: vpinsrb $13, %r15d, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $1, %ecx, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $2, %edx, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $3, %r13d, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $4, %eax, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $5, %esi, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $6, %edi, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $7, %r8d, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $8, %r9d, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $9, %ebx, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $10, %ebp, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $11, %r10d, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $12, %r11d, %xmm7, %xmm7
+; KNL-NEXT: vpinsrb $13, %r14d, %xmm7, %xmm7
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; KNL-NEXT: vpsllw $15, %ymm4, %ymm4
; KNL-NEXT: vpsraw $15, %ymm4, %ymm4
@@ -1803,8 +1799,8 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
; KNL-NEXT: vpsllw $15, %ymm4, %ymm4
; KNL-NEXT: vpsraw $15, %ymm4, %ymm4
; KNL-NEXT: vpand %ymm2, %ymm4, %ymm2
-; KNL-NEXT: vpinsrb $14, %r12d, %xmm7, %xmm4
-; KNL-NEXT: vpinsrb $15, %edx, %xmm4, %xmm4
+; KNL-NEXT: vpinsrb $14, %r15d, %xmm7, %xmm4
+; KNL-NEXT: vpinsrb $15, %r12d, %xmm4, %xmm4
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; KNL-NEXT: vpsllw $15, %ymm4, %ymm4
; KNL-NEXT: vpsraw $15, %ymm4, %ymm4
@@ -1821,15 +1817,206 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %zmm2, %zmm2
; SKX-NEXT: vpmovb2m %zmm2, %k1
-; SKX-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; SKX-NEXT: vpxord %zmm3, %zmm3, %zmm3
-; SKX-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
+; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: kshiftrq $32, %k1, %k1
-; SKX-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1}
-; SKX-NEXT: vmovaps %zmm3, %zmm0
-; SKX-NEXT: vmovaps %zmm2, %zmm1
+; SKX-NEXT: vmovdqu16 %zmm1, %zmm1 {%k1} {z}
; SKX-NEXT: retq
%ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer
ret <64 x i16> %ret
}
+define <16 x i16> @shuffle_zext_16x8_to_16x16(<16 x i8> %a) nounwind readnone {
+; ALL-LABEL: shuffle_zext_16x8_to_16x16:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; ALL-NEXT: retq
+ %1 = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <32 x i32> <i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7, i32 16, i32 8, i32 16, i32 9, i32 16, i32 10, i32 16, i32 11, i32 16, i32 12, i32 16, i32 13, i32 16, i32 14, i32 16, i32 15, i32 16>
+ %2 = bitcast <32 x i8> %1 to <16 x i16>
+ ret <16 x i16> %2
+}
+
+define <16 x i16> @zext_32x8_to_16x16(<32 x i8> %a) {
+; ALL-LABEL: zext_32x8_to_16x16:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; ALL-NEXT: retq
+ %1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 1, i32 32, i32 2, i32 32, i32 3, i32 32, i32 4, i32 32, i32 5, i32 32, i32 6, i32 32, i32 7, i32 32, i32 8, i32 32, i32 9, i32 32, i32 10, i32 32, i32 11, i32 32, i32 12, i32 32, i32 13, i32 32, i32 14, i32 32, i32 15, i32 32>
+ %2 = bitcast <32 x i8> %1 to <16 x i16>
+ ret <16 x i16> %2
+}
+
+define <8 x i32> @zext_32x8_to_8x32(<32 x i8> %a) {
+; ALL-LABEL: zext_32x8_to_8x32:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; ALL-NEXT: retq
+ %1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 1, i32 32, i32 32, i32 32, i32 2, i32 32, i32 32, i32 32, i32 3, i32 32, i32 32, i32 32, i32 4, i32 32, i32 32, i32 32, i32 5, i32 32, i32 32, i32 32, i32 6, i32 32, i32 32, i32 32, i32 7, i32 32, i32 32, i32 32>
+ %2 = bitcast <32 x i8> %1 to <8 x i32>
+ ret <8 x i32> %2
+}
+
+define <4 x i64> @zext_32x8_to_4x64(<32 x i8> %a) {
+; ALL-LABEL: zext_32x8_to_4x64:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; ALL-NEXT: retq
+ %1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 1, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 2, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 3, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %2 = bitcast <32 x i8> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
+
+define <8 x i32> @zext_16x16_to_8x32(<16 x i16> %a) {
+; ALL-LABEL: zext_16x16_to_8x32:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; ALL-NEXT: retq
+ %1 = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7, i32 16>
+ %2 = bitcast <16 x i16> %1 to <8 x i32>
+ ret <8 x i32> %2
+}
+
+define <4 x i64> @zext_16x16_to_4x64(<16 x i16> %a) {
+; ALL-LABEL: zext_16x16_to_4x64:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; ALL-NEXT: retq
+ %1 = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 2, i32 16, i32 16, i32 16, i32 3, i32 16, i32 16, i32 16>
+ %2 = bitcast <16 x i16> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
+
+define <4 x i64> @zext_8x32_to_4x64(<8 x i32> %a) {
+; ALL-LABEL: zext_8x32_to_4x64:
+; ALL: ## BB#0:
+; ALL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; ALL-NEXT: retq
+ %1 = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8>
+ %2 = bitcast <8 x i32> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
+
+define <64 x i8> @zext_64xi1_to_64xi8(<64 x i8> %x, <64 x i8> %y) #0 {
+; KNL-LABEL: zext_64xi1_to_64xi8:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT: vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
+; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_64xi1_to_64xi8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %mask = icmp eq <64 x i8> %x, %y
+ %1 = zext <64 x i1> %mask to <64 x i8>
+ ret <64 x i8> %1
+}
+
+define <32 x i16> @zext_32xi1_to_32xi16(<32 x i16> %x, <32 x i16> %y) #0 {
+; KNL-LABEL: zext_32xi1_to_32xi16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0
+; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
+; KNL-NEXT: vpsrlw $15, %ymm1, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_32xi1_to_32xi16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %mask = icmp eq <32 x i16> %x, %y
+ %1 = zext <32 x i1> %mask to <32 x i16>
+ ret <32 x i16> %1
+}
+
+define <16 x i16> @zext_16xi1_to_16xi16(<16 x i16> %x, <16 x i16> %y) #0 {
+; KNL-LABEL: zext_16xi1_to_16xi16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_16xi1_to_16xi16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %mask = icmp eq <16 x i16> %x, %y
+ %1 = zext <16 x i1> %mask to <16 x i16>
+ ret <16 x i16> %1
+}
+
+
+define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 {
+; KNL-LABEL: zext_32xi1_to_32xi8:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
+; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
+; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_32xi1_to_32xi8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %mask = icmp eq <32 x i16> %x, %y
+ %1 = zext <32 x i1> %mask to <32 x i8>
+ ret <32 x i8> %1
+}
+
+define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
+; KNL-LABEL: zext_4xi1_to_4x32:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; KNL-NEXT: vpand %xmm2, %xmm1, %xmm1
+; KNL-NEXT: vpand %xmm2, %xmm0, %xmm0
+; KNL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vpsrld $31, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_4xi1_to_4x32:
+; SKX: ## BB#0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SKX-NEXT: vpandq %xmm2, %xmm1, %xmm1
+; SKX-NEXT: vpandq %xmm2, %xmm0, %xmm0
+; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %mask = icmp eq <4 x i8> %x, %y
+ %1 = zext <4 x i1> %mask to <4 x i32>
+ ret <4 x i32> %1
+}
+
+define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 {
+; KNL-LABEL: zext_2xi1_to_2xi64:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; KNL-NEXT: vpand %xmm2, %xmm1, %xmm1
+; KNL-NEXT: vpand %xmm2, %xmm0, %xmm0
+; KNL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vpsrlq $63, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: zext_2xi1_to_2xi64:
+; SKX: ## BB#0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SKX-NEXT: vpandq %xmm2, %xmm1, %xmm1
+; SKX-NEXT: vpandq %xmm2, %xmm0, %xmm0
+; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
+; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %mask = icmp eq <2 x i8> %x, %y
+ %1 = zext <2 x i1> %mask to <2 x i64>
+ ret <2 x i64> %1
+}
diff --git a/test/CodeGen/X86/avx512-extract-subvector.ll b/test/CodeGen/X86/avx512-extract-subvector.ll
index 703f7832588c..8bd57c0fc1da 100644
--- a/test/CodeGen/X86/avx512-extract-subvector.ll
+++ b/test/CodeGen/X86/avx512-extract-subvector.ll
@@ -14,6 +14,7 @@ define <8 x i16> @extract_subvector128_v32i16(<32 x i16> %x) nounwind {
define <8 x i16> @extract_subvector128_v32i16_first_element(<32 x i16> %x) nounwind {
; SKX-LABEL: extract_subvector128_v32i16_first_element:
; SKX: ## BB#0:
+; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; SKX-NEXT: retq
%r1 = shufflevector <32 x i16> %x, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %r1
@@ -31,6 +32,7 @@ define <16 x i8> @extract_subvector128_v64i8(<64 x i8> %x) nounwind {
define <16 x i8> @extract_subvector128_v64i8_first_element(<64 x i8> %x) nounwind {
; SKX-LABEL: extract_subvector128_v64i8_first_element:
; SKX: ## BB#0:
+; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; SKX-NEXT: retq
%r1 = shufflevector <64 x i8> %x, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %r1
@@ -54,3 +56,291 @@ define <32 x i8> @extract_subvector256_v64i8(<64 x i8> %x) nounwind {
%r1 = shufflevector <64 x i8> %x, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
ret <32 x i8> %r1
}
+
+define void @extract_subvector256_v8f64_store(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v8f64_store:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vextractf64x2 $1, %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+ %1 = bitcast double* %addr to <2 x double>*
+ store <2 x double> %0, <2 x double>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v8f32_store(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v8f32_store:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vextractf32x4 $1, %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1 = bitcast float* %addr to <4 x float>*
+ store <4 x float> %0, <4 x float>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v4i64_store(i64* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v4i64_store:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vextracti64x2 $1, %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+ %1 = bitcast i64* %addr to <2 x i64>*
+ store <2 x i64> %0, <2 x i64>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v8i32_store(i32* nocapture %addr, <8 x i32> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v8i32_store:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1 = bitcast i32* %addr to <4 x i32>*
+ store <4 x i32> %0, <4 x i32>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v16i16_store(i16* nocapture %addr, <16 x i16> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v16i16_store:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %1 = bitcast i16* %addr to <8 x i16>*
+ store <8 x i16> %0, <8 x i16>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v32i8_store(i8* nocapture %addr, <32 x i8> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v32i8_store:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %1 = bitcast i8* %addr to <16 x i8>*
+ store <16 x i8> %0, <16 x i8>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v4f64_store_lo(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v4f64_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovupd %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+ %1 = bitcast double* %addr to <2 x double>*
+ store <2 x double> %0, <2 x double>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v4f32_store_lo(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v4f32_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = bitcast float* %addr to <4 x float>*
+ store <4 x float> %0, <4 x float>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v2i64_store_lo(i64* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v2i64_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu64 %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %1 = bitcast i64* %addr to <2 x i64>*
+ store <2 x i64> %0, <2 x i64>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v4i32_store_lo(i32* nocapture %addr, <8 x i32> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v4i32_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu32 %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = bitcast i32* %addr to <4 x i32>*
+ store <4 x i32> %0, <4 x i32>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v8i16_store_lo(i16* nocapture %addr, <16 x i16> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v8i16_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu32 %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %1 = bitcast i16* %addr to <8 x i16>*
+ store <8 x i16> %0, <8 x i16>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector256_v16i8_store_lo(i8* nocapture %addr, <32 x i8> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector256_v16i8_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu32 %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %1 = bitcast i8* %addr to <16 x i8>*
+ store <16 x i8> %0, <16 x i8>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v2f64_store_lo(double* nocapture %addr, <8 x double> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v2f64_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovupd %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+ %1 = bitcast double* %addr to <2 x double>*
+ store <2 x double> %0, <2 x double>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v4f32_store_lo(float* nocapture %addr, <16 x float> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v4f32_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovups %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = bitcast float* %addr to <4 x float>*
+ store <4 x float> %0, <4 x float>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v2i64_store_lo(i64* nocapture %addr, <8 x i64> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v2i64_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu64 %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %1 = bitcast i64* %addr to <2 x i64>*
+ store <2 x i64> %0, <2 x i64>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v4i32_store_lo(i32* nocapture %addr, <16 x i32> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v4i32_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu32 %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = bitcast i32* %addr to <4 x i32>*
+ store <4 x i32> %0, <4 x i32>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v8i16_store_lo(i16* nocapture %addr, <32 x i16> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v8i16_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu32 %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <32 x i16> %a, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %1 = bitcast i16* %addr to <8 x i16>*
+ store <8 x i16> %0, <8 x i16>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v16i8_store_lo(i8* nocapture %addr, <64 x i8> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v16i8_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu32 %xmm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <64 x i8> %a, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %1 = bitcast i8* %addr to <16 x i8>*
+ store <16 x i8> %0, <16 x i8>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v4f64_store_lo(double* nocapture %addr, <8 x double> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v4f64_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovupd %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = bitcast double* %addr to <4 x double>*
+ store <4 x double> %0, <4 x double>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v8f32_store_lo(float* nocapture %addr, <16 x float> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v8f32_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovups %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %1 = bitcast float* %addr to <8 x float>*
+ store <8 x float> %0, <8 x float>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v4i64_store_lo(i64* nocapture %addr, <8 x i64> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v4i64_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu64 %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = bitcast i64* %addr to <4 x i64>*
+ store <4 x i64> %0, <4 x i64>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v8i32_store_lo(i32* nocapture %addr, <16 x i32> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v8i32_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu32 %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %1 = bitcast i32* %addr to <8 x i32>*
+ store <8 x i32> %0, <8 x i32>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v16i16_store_lo(i16* nocapture %addr, <32 x i16> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v16i16_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu32 %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %1 = bitcast i16* %addr to <16 x i16>*
+ store <16 x i16> %0, <16 x i16>* %1, align 1
+ ret void
+}
+
+define void @extract_subvector512_v32i8_store_lo(i8* nocapture %addr, <64 x i8> %a) nounwind uwtable ssp {
+; SKX-LABEL: extract_subvector512_v32i8_store_lo:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: vmovdqu32 %ymm0, (%rdi)
+; SKX-NEXT: retq
+entry:
+ %0 = shufflevector <64 x i8> %a, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %1 = bitcast i8* %addr to <32 x i8>*
+ store <32 x i8> %0, <32 x i8>* %1, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/avx512-fma-intrinsics.ll b/test/CodeGen/X86/avx512-fma-intrinsics.ll
index c30fc909f09b..d8026cd987c2 100644
--- a/test/CodeGen/X86/avx512-fma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-fma-intrinsics.ll
@@ -1,78 +1,104 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f --show-mc-encoding | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s
declare <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
declare <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
define <16 x float> @test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_x86_vfnmadd_ps_z
- ; CHECK: vfnmadd213ps %zmm
+; CHECK-LABEL: test_x86_vfnmadd_ps_z:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfnmadd213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
define <16 x float> @test_mask_vfnmadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_vfnmadd_ps
- ; CHECK: vfnmadd213ps %zmm
+; CHECK-LABEL: test_mask_vfnmadd_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfnmadd213ps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
ret <16 x float> %res
}
define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_x86_vfnmadd_pd_z
- ; CHECK: vfnmadd213pd %zmm
+; CHECK-LABEL: test_x86_vfnmadd_pd_z:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfnmadd213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
define <8 x double> @test_mask_vfnmadd_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmadd_pd
- ; CHECK: vfnmadd213pd %zmm
+; CHECK-LABEL: test_mask_vfnmadd_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfnmadd213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
ret <8 x double> %res
}
define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_x86_vfnmsubps_z
- ; CHECK: vfnmsub213ps %zmm
+; CHECK-LABEL: test_x86_vfnmsubps_z:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfnmsub213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
define <16 x float> @test_mask_vfnmsub_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_vfnmsub_ps
- ; CHECK: vfnmsub213ps %zmm
+; CHECK-LABEL: test_mask_vfnmsub_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfnmsub213ps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
ret <16 x float> %res
}
define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_x86_vfnmsubpd_z
- ; CHECK: vfnmsub213pd %zmm
+; CHECK-LABEL: test_x86_vfnmsubpd_z:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
define <8 x double> @test_mask_vfnmsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmsub_pd
- ; CHECK: vfnmsub213pd %zmm
+; CHECK-LABEL: test_mask_vfnmsub_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
ret <8 x double> %res
}
define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_x86_vfmaddsubps_z
- ; CHECK: vfmaddsub213ps %zmm
+; CHECK-LABEL: test_x86_vfmaddsubps_z:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmaddsub213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_fmaddsub_ps(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
; CHECK-LABEL: test_mask_fmaddsub_ps:
-; CHECK: vfmaddsub213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xa6,0xc2]
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmaddsub213ps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4)
ret <16 x float> %res
}
@@ -80,16 +106,21 @@ define <16 x float> @test_mask_fmaddsub_ps(<16 x float> %a, <16 x float> %b, <16
declare <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
define <8 x double> @test_x86_vfmaddsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_x86_vfmaddsubpd_z
- ; CHECK: vfmaddsub213pd %zmm
+; CHECK-LABEL: test_x86_vfmaddsubpd_z:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
define <8 x double> @test_mask_vfmaddsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmaddsub_pd
- ; CHECK: vfmaddsub213pd %zmm
+; CHECK-LABEL: test_mask_vfmaddsub_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
ret <8 x double> %res
}
@@ -97,8 +128,7 @@ define <8 x double> @test_mask_vfmaddsub_pd(<8 x double> %a0, <8 x double> %a1,
define <8 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -115,8 +145,7 @@ declare <8 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.512(<8 x double>, <8 x
define <8 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmaddsub231pd %zmm1, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -133,8 +162,7 @@ declare <8 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.512(<8 x double>, <8 x
define <8 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm3 {%k1} {z}
; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -200,8 +228,7 @@ declare <8 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.512(<8 x double>, <8 x
define <8 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmsubadd231pd %zmm1, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -231,71 +258,96 @@ define <16 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_512(<16 x float> %x0,
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rne
- ; CHECK: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rne:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 0) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtn
- ; CHECK: vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x39,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtn:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 1) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtp
- ; CHECK: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x59,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtp:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 2) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtz
- ; CHECK: vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x79,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 3) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_current
- ; CHECK: vfmadd213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_current:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rne
- ; CHECK: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rne:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 0) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtn
- ; CHECK: vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x38,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtn:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 1) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtp
- ; CHECK: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x58,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtp:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 2) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtz
- ; CHECK: vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x78,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 3) nounwind
ret <16 x float> %res
}
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_current
- ; CHECK: vfmadd213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_current:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
ret <16 x float> %res
}
@@ -305,8 +357,7 @@ declare <8 x double> @llvm.x86.avx512.mask3.vfmsub.pd.512(<8 x double>, <8 x dou
define <8 x double>@test_int_x86_avx512_mask3_vfmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmsub231pd %zmm1, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -336,71 +387,96 @@ define <16 x float>@test_int_x86_avx512_mask3_vfmsub_ps_512(<16 x float> %x0, <1
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rne
- ; CHECK: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x19,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rne:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtn
- ; CHECK: vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x39,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtn:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtp
- ; CHECK: vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x59,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtp:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtz
- ; CHECK: vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_current
- ; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_current:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rne
- ; CHECK: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rne:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtn
- ; CHECK: vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x38,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtn:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtp
- ; CHECK: vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x58,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtp:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtz
- ; CHECK: vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x78,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_current
- ; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_current:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
@@ -408,8 +484,7 @@ define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_current(<8 x double> %a0,
define <8 x double>@test_int_x86_avx512_mask_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -426,8 +501,7 @@ declare <8 x double> @llvm.x86.avx512.mask3.vfmadd.pd.512(<8 x double>, <8 x dou
define <8 x double>@test_int_x86_avx512_mask3_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmadd231pd %zmm1, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -444,8 +518,7 @@ declare <8 x double> @llvm.x86.avx512.maskz.vfmadd.pd.512(<8 x double>, <8 x dou
define <8 x double>@test_int_x86_avx512_maskz_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm3 {%k1} {z}
; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -508,71 +581,96 @@ define <16 x float>@test_int_x86_avx512_maskz_vfmadd_ps_512(<16 x float> %x0, <1
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rne
- ; CHECK: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x19,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rne:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtn
- ; CHECK: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x39,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtn:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtp
- ; CHECK: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x59,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtp:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtz
- ; CHECK: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_current
- ; CHECK: vfnmsub213pd %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_current:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rne
- ; CHECK: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rne:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtn
- ; CHECK: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x38,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtn:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtp
- ; CHECK: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x58,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtp:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtz
- ; CHECK: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x78,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
ret <8 x double> %res
}
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_current
- ; CHECK: vfnmsub213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_current:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
ret <8 x double> %res
}
@@ -580,8 +678,7 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_current(<8 x double> %a0
define <8 x double>@test_int_x86_avx512_mask_vfnmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -598,8 +695,7 @@ declare <8 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.512(<8 x double>, <8 x do
define <8 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfnmsub231pd %zmm1, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -646,8 +742,7 @@ define <16 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_512(<16 x float> %x0, <
define <8 x double>@test_int_x86_avx512_mask_vfnmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfnmadd213pd %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
diff --git a/test/CodeGen/X86/avx512-fma.ll b/test/CodeGen/X86/avx512-fma.ll
index 9279441a23c7..b2d08355a851 100644
--- a/test/CodeGen/X86/avx512-fma.ll
+++ b/test/CodeGen/X86/avx512-fma.ll
@@ -67,11 +67,17 @@ define <8 x double> @test_x86_fmsub_pd_z(<8 x double> %a0, <8 x double> %a1, <8
}
define double @test_x86_fmsub_213(double %a0, double %a1, double %a2) {
-; ALL-LABEL: test_x86_fmsub_213:
-; ALL: ## BB#0:
-; ALL-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1
-; ALL-NEXT: vmovaps %zmm1, %zmm0
-; ALL-NEXT: retq
+; KNL-LABEL: test_x86_fmsub_213:
+; KNL: ## BB#0:
+; KNL-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1
+; KNL-NEXT: vmovaps %zmm1, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_x86_fmsub_213:
+; SKX: ## BB#0:
+; SKX-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1
+; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: retq
%x = fmul double %a0, %a1
%res = fsub double %x, %a2
ret double %res
@@ -86,7 +92,8 @@ define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) {
;
; SKX-LABEL: test_x86_fmsub_213_m:
; SKX: ## BB#0:
-; SKX-NEXT: vfmsub213sd (%rdi), %xmm1, %xmm0
+; SKX-NEXT: vfmsub213sd (%rdi), %xmm0, %xmm1
+; SKX-NEXT: vmovaps %xmm1, %xmm0
; SKX-NEXT: retq
%a2 = load double , double *%a2_ptr
%x = fmul double %a0, %a1
@@ -95,11 +102,17 @@ define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) {
}
define double @test_x86_fmsub_231_m(double %a0, double %a1, double * %a2_ptr) {
-; ALL-LABEL: test_x86_fmsub_231_m:
-; ALL: ## BB#0:
-; ALL-NEXT: vfmsub231sd (%rdi), %xmm0, %xmm1
-; ALL-NEXT: vmovaps %zmm1, %zmm0
-; ALL-NEXT: retq
+; KNL-LABEL: test_x86_fmsub_231_m:
+; KNL: ## BB#0:
+; KNL-NEXT: vfmsub231sd (%rdi), %xmm0, %xmm1
+; KNL-NEXT: vmovaps %zmm1, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_x86_fmsub_231_m:
+; SKX: ## BB#0:
+; SKX-NEXT: vfmsub231sd (%rdi), %xmm0, %xmm1
+; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: retq
%a2 = load double , double *%a2_ptr
%x = fmul double %a0, %a2
%res = fsub double %x, %a1
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index 9ba18192f5d2..d6bc66b591b2 100644
--- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -183,7 +183,7 @@ define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src,
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %edi, %k1
; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
ret <8 x float> %res;
@@ -281,7 +281,7 @@ define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1
; CHECK-LABEL: test_int_x86_avx512_gather3div2_df:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %xmm0, %xmm2
; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,2), %xmm0 {%k1}
@@ -314,7 +314,7 @@ define <4 x double>@test_int_x86_avx512_gather3div4_df(<4 x double> %x0, i8* %x1
; CHECK-LABEL: test_int_x86_avx512_gather3div4_df:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %ymm0, %ymm2
; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,4), %ymm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,2), %ymm0 {%k1}
@@ -332,7 +332,7 @@ define <8 x i32>@test_int_x86_avx512_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x
; CHECK-LABEL: test_int_x86_avx512_gather3div4_di:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %ymm0, %ymm2
; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
@@ -350,7 +350,7 @@ define <4 x float>@test_int_x86_avx512_gather3div4_sf(<4 x float> %x0, i8* %x1,
; CHECK-LABEL: test_int_x86_avx512_gather3div4_sf:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %xmm0, %xmm2
; CHECK-NEXT: vgatherqps (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherqps (%rdi,%xmm1,2), %xmm0 {%k1}
@@ -369,7 +369,7 @@ define <4 x i32>@test_int_x86_avx512_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %xmm0, %xmm2
; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k2}
; CHECK-NEXT: vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
@@ -386,7 +386,7 @@ define <4 x float>@test_int_x86_avx512_gather3div8_sf(<4 x float> %x0, i8* %x1,
; CHECK-LABEL: test_int_x86_avx512_gather3div8_sf:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %xmm0, %xmm2
; CHECK-NEXT: vgatherqps (%rdi,%ymm1,4), %xmm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherqps (%rdi,%ymm1,2), %xmm0 {%k1}
@@ -404,7 +404,7 @@ define <4 x i32>@test_int_x86_avx512_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x
; CHECK-LABEL: test_int_x86_avx512_gather3div8_si:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %xmm0, %xmm2
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
; CHECK-NEXT: vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
@@ -422,7 +422,7 @@ define <2 x double>@test_int_x86_avx512_gather3siv2_df(<2 x double> %x0, i8* %x1
; CHECK-LABEL: test_int_x86_avx512_gather3siv2_df:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %xmm0, %xmm2
; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,2), %xmm0 {%k1}
@@ -455,7 +455,7 @@ define <4 x double>@test_int_x86_avx512_gather3siv4_df(<4 x double> %x0, i8* %x1
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_df:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %ymm0, %ymm2
; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %ymm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,2), %ymm0 {%k1}
@@ -488,7 +488,7 @@ define <4 x float>@test_int_x86_avx512_gather3siv4_sf(<4 x float> %x0, i8* %x1,
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_sf:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %xmm0, %xmm2
; CHECK-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherdps (%rdi,%xmm1,2), %xmm0 {%k1}
@@ -507,7 +507,7 @@ define <4 x i32>@test_int_x86_avx512_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %xmm0, %xmm2
; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k2}
; CHECK-NEXT: vpgatherdd (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
@@ -524,7 +524,7 @@ define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, i8* %x1,
; CHECK-LABEL: test_int_x86_avx512_gather3siv8_sf:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %ymm0, %ymm2
; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm2 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm0 {%k1}
@@ -542,7 +542,7 @@ define <8 x i32>@test_int_x86_avx512_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x
; CHECK-LABEL: test_int_x86_avx512_gather3siv8_si:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovb %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %ymm0, %ymm2
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2}
; CHECK-NEXT: vpgatherdd (%rdi,%ymm1,2), %ymm0 {%k1}
diff --git a/test/CodeGen/X86/avx512-inc-dec.ll b/test/CodeGen/X86/avx512-inc-dec.ll
index f04ca878f434..5183c9d0fb8f 100644
--- a/test/CodeGen/X86/avx512-inc-dec.ll
+++ b/test/CodeGen/X86/avx512-inc-dec.ll
@@ -2,7 +2,7 @@
;CHECK-LABEL: test
;CHECK-NOT: dec
-;CHECK_NOT: enc
+;CHECK-NOT: enc
;CHECK: ret
define i32 @test(i32 %a, i32 %b) {
%a1 = add i32 %a, -1
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index 41ec62c7e047..2c42aca33e45 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -1,11 +1,25 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefix=KNL --check-prefix=CHECK %s
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX --check-prefix=CHECK %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefix=KNL %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX %s
-;CHECK-LABEL: test1:
-;CHECK: vinsertps
-;CHECK: vinsertf32x4
-;CHECK: ret
define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
+; KNL-LABEL: test1:
+; KNL: ## BB#0:
+; KNL-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
+; KNL-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
+; KNL-NEXT: vextractf32x4 $3, %zmm0, %xmm2
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; KNL-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test1:
+; SKX: ## BB#0:
+; SKX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
+; SKX-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
+; SKX-NEXT: vextractf32x4 $3, %zmm0, %xmm2
+; SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; SKX-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
+; SKX-NEXT: retq
%rrr = load float, float* %br
%rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
%rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14
@@ -15,19 +29,19 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
; KNL-LABEL: test2:
; KNL: ## BB#0:
-; KNL-NEXT: vmovhpd (%rdi), %xmm0, %xmm2
+; KNL-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
; KNL-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
; KNL-NEXT: vextractf32x4 $3, %zmm0, %xmm2
-; KNL-NEXT: vmovsd %xmm1, %xmm2, %xmm1
+; KNL-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
; KNL-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test2:
; SKX: ## BB#0:
-; SKX-NEXT: vmovhpd (%rdi), %xmm0, %xmm2
+; SKX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
; SKX-NEXT: vinsertf64x2 $0, %xmm2, %zmm0, %zmm0
; SKX-NEXT: vextractf64x2 $3, %zmm0, %xmm2
-; SKX-NEXT: vmovsd %xmm1, %xmm2, %xmm1
+; SKX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
; SKX-NEXT: vinsertf64x2 $3, %xmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%rrr = load double, double* %br
@@ -36,11 +50,20 @@ define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
ret <8 x double> %rrr3
}
-;CHECK-LABEL: test3:
-;CHECK: vextractf32x4 $1
-;CHECK: vinsertf32x4 $0
-;CHECK: ret
define <16 x float> @test3(<16 x float> %x) nounwind {
+; KNL-LABEL: test3:
+; KNL: ## BB#0:
+; KNL-NEXT: vextractf32x4 $1, %zmm0, %xmm1
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
+; KNL-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test3:
+; SKX: ## BB#0:
+; SKX-NEXT: vextractf32x4 $1, %zmm0, %xmm1
+; SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
+; SKX-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
+; SKX-NEXT: retq
%eee = extractelement <16 x float> %x, i32 4
%rrr2 = insertelement <16 x float> %x, float %eee, i32 1
ret <16 x float> %rrr2
@@ -67,70 +90,140 @@ define <8 x i64> @test4(<8 x i64> %x) nounwind {
ret <8 x i64> %rrr2
}
-;CHECK-LABEL: test5:
-;CHECK: vextractps
-;CHECK: ret
define i32 @test5(<4 x float> %x) nounwind {
+; KNL-LABEL: test5:
+; KNL: ## BB#0:
+; KNL-NEXT: vextractps $3, %xmm0, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test5:
+; SKX: ## BB#0:
+; SKX-NEXT: vextractps $3, %xmm0, %eax
+; SKX-NEXT: retq
%ef = extractelement <4 x float> %x, i32 3
%ei = bitcast float %ef to i32
ret i32 %ei
}
-;CHECK-LABEL: test6:
-;CHECK: vextractps {{.*}}, (%rdi)
-;CHECK: ret
define void @test6(<4 x float> %x, float* %out) nounwind {
+; KNL-LABEL: test6:
+; KNL: ## BB#0:
+; KNL-NEXT: vextractps $3, %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test6:
+; SKX: ## BB#0:
+; SKX-NEXT: vextractps $3, %xmm0, (%rdi)
+; SKX-NEXT: retq
%ef = extractelement <4 x float> %x, i32 3
store float %ef, float* %out, align 4
ret void
}
-;CHECK-LABEL: test7
-;CHECK: vmovd
-;CHECK: vpermps %zmm
-;CHECK: ret
define float @test7(<16 x float> %x, i32 %ind) nounwind {
+; KNL-LABEL: test7:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovd %edi, %xmm1
+; KNL-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test7:
+; SKX: ## BB#0:
+; SKX-NEXT: vmovd %edi, %xmm1
+; SKX-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT: retq
%e = extractelement <16 x float> %x, i32 %ind
ret float %e
}
-;CHECK-LABEL: test8
-;CHECK: vmovq
-;CHECK: vpermpd %zmm
-;CHECK: ret
define double @test8(<8 x double> %x, i32 %ind) nounwind {
+; KNL-LABEL: test8:
+; KNL: ## BB#0:
+; KNL-NEXT: movslq %edi, %rax
+; KNL-NEXT: vmovq %rax, %xmm1
+; KNL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test8:
+; SKX: ## BB#0:
+; SKX-NEXT: movslq %edi, %rax
+; SKX-NEXT: vmovq %rax, %xmm1
+; SKX-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT: retq
%e = extractelement <8 x double> %x, i32 %ind
ret double %e
}
-;CHECK-LABEL: test9
-;CHECK: vmovd
-;CHECK: vpermps %ymm
-;CHECK: ret
define float @test9(<8 x float> %x, i32 %ind) nounwind {
+; KNL-LABEL: test9:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovd %edi, %xmm1
+; KNL-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test9:
+; SKX: ## BB#0:
+; SKX-NEXT: vmovd %edi, %xmm1
+; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; SKX-NEXT: retq
%e = extractelement <8 x float> %x, i32 %ind
ret float %e
}
-;CHECK-LABEL: test10
-;CHECK: vmovd
-;CHECK: vpermd %zmm
-;CHECK: vmovd %xmm0, %eax
-;CHECK: ret
define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
+; KNL-LABEL: test10:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovd %edi, %xmm1
+; KNL-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; KNL-NEXT: vmovd %xmm0, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test10:
+; SKX: ## BB#0:
+; SKX-NEXT: vmovd %edi, %xmm1
+; SKX-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; SKX-NEXT: vmovd %xmm0, %eax
+; SKX-NEXT: retq
%e = extractelement <16 x i32> %x, i32 %ind
ret i32 %e
}
-;CHECK-LABEL: test11
-;CHECK: vpcmpltud
-;CHECK: kshiftlw $11
-;CHECK: kshiftrw $15
-;CHECK: testb
-;CHECK: je
-;CHECK: ret
-;CHECK: ret
define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
+; KNL-LABEL: test11:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; KNL-NEXT: kshiftlw $11, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: testb %al, %al
+; KNL-NEXT: je LBB10_2
+; KNL-NEXT: ## BB#1: ## %A
+; KNL-NEXT: vmovaps %zmm1, %zmm0
+; KNL-NEXT: retq
+; KNL-NEXT: LBB10_2: ## %B
+; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test11:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; SKX-NEXT: kshiftlw $11, %k0, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: testb %al, %al
+; SKX-NEXT: je LBB10_2
+; SKX-NEXT: ## BB#1: ## %A
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
+; SKX-NEXT: LBB10_2: ## %B
+; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; SKX-NEXT: retq
%cmp_res = icmp ult <16 x i32> %a, %b
%ia = extractelement <16 x i1> %cmp_res, i32 4
br i1 %ia, label %A, label %B
@@ -141,73 +234,144 @@ define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
ret <16 x i32>%c
}
-;CHECK-LABEL: test12
-;CHECK: vpcmpgtq
-;CHECK: kshiftlw $15
-;CHECK: kshiftrw $15
-;CHECK: testb
-;CHECK: ret
-
define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) {
-
+; KNL-LABEL: test12:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpgtq %zmm0, %zmm2, %k0
+; KNL-NEXT: vpcmpgtq %zmm1, %zmm3, %k1
+; KNL-NEXT: kunpckbw %k0, %k1, %k0
+; KNL-NEXT: kshiftlw $15, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: testb %al, %al
+; KNL-NEXT: cmoveq %rsi, %rdi
+; KNL-NEXT: movq %rdi, %rax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test12:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpgtq %zmm0, %zmm2, %k0
+; SKX-NEXT: vpcmpgtq %zmm1, %zmm3, %k1
+; SKX-NEXT: kunpckbw %k0, %k1, %k0
+; SKX-NEXT: kshiftlw $15, %k0, %k0
+; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: testb %al, %al
+; SKX-NEXT: cmoveq %rsi, %rdi
+; SKX-NEXT: movq %rdi, %rax
+; SKX-NEXT: retq
%cmpvector_func.i = icmp slt <16 x i64> %a, %b
%extract24vector_func.i = extractelement <16 x i1> %cmpvector_func.i, i32 0
%res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
ret i64 %res
}
-;CHECK-LABEL: test13
-;CHECK: cmpl %esi, %edi
-;CHECK: setb %al
-;CHECK: andl $1, %eax
-;CHECK: kmovw %eax, %k0
-;CHECK: movw $-4
-;CHECK: korw
define i16 @test13(i32 %a, i32 %b) {
+; KNL-LABEL: test13:
+; KNL: ## BB#0:
+; KNL-NEXT: cmpl %esi, %edi
+; KNL-NEXT: setb %al
+; KNL-NEXT: kmovw %eax, %k0
+; KNL-NEXT: movw $-4, %ax
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: korw %k0, %k1, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test13:
+; SKX: ## BB#0:
+; SKX-NEXT: cmpl %esi, %edi
+; SKX-NEXT: setb %al
+; SKX-NEXT: kmovw %eax, %k0
+; SKX-NEXT: movw $-4, %ax
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: korw %k0, %k1, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: retq
%cmp_res = icmp ult i32 %a, %b
%maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %cmp_res, i32 0
%res = bitcast <16 x i1> %maskv to i16
ret i16 %res
}
-;CHECK-LABEL: test14
-;CHECK: vpcmpgtq
-;KNL: kshiftlw $11
-;KNL: kshiftrw $15
-;KNL: testb
-;SKX: kshiftlb $3
-;SKX: kshiftrb $7
-;SKX: testb
-;CHECK: ret
-
define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
-
+; KNL-LABEL: test14:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
+; KNL-NEXT: kshiftlw $11, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: testb %al, %al
+; KNL-NEXT: cmoveq %rsi, %rdi
+; KNL-NEXT: movq %rdi, %rax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test14:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
+; SKX-NEXT: kshiftlb $3, %k0, %k0
+; SKX-NEXT: kshiftrb $7, %k0, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: testb %al, %al
+; SKX-NEXT: cmoveq %rsi, %rdi
+; SKX-NEXT: movq %rdi, %rax
+; SKX-NEXT: retq
%cmpvector_func.i = icmp slt <8 x i64> %a, %b
%extract24vector_func.i = extractelement <8 x i1> %cmpvector_func.i, i32 4
%res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
ret i64 %res
}
-;CHECK-LABEL: test15
-;CHECK: movb (%rdi), %al
-;CHECK: andb $1, %al
-;CHECK: movw $-1, %ax
-;CHECK: cmovew
define i16 @test15(i1 *%addr) {
+; KNL-LABEL: test15:
+; KNL: ## BB#0:
+; KNL-NEXT: movb (%rdi), %al
+; KNL-NEXT: xorl %ecx, %ecx
+; KNL-NEXT: testb %al, %al
+; KNL-NEXT: movw $-1, %ax
+; KNL-NEXT: cmovew %cx, %ax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test15:
+; SKX: ## BB#0:
+; SKX-NEXT: movb (%rdi), %al
+; SKX-NEXT: xorl %ecx, %ecx
+; SKX-NEXT: testb %al, %al
+; SKX-NEXT: movw $-1, %ax
+; SKX-NEXT: cmovew %cx, %ax
+; SKX-NEXT: retq
%x = load i1 , i1 * %addr, align 1
%x1 = insertelement <16 x i1> undef, i1 %x, i32 10
%x2 = bitcast <16 x i1>%x1 to i16
ret i16 %x2
}
-;CHECK-LABEL: test16
-;CHECK: movb (%rdi), %al
-;CHECK: andw $1, %ax
-;CHECK: kmovw
-;CHECK: kshiftlw $10
-;CHECK: korw
-;CHECK: ret
define i16 @test16(i1 *%addr, i16 %a) {
+; KNL-LABEL: test16:
+; KNL: ## BB#0:
+; KNL-NEXT: movzbl (%rdi), %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: kmovw %eax, %k0
+; KNL-NEXT: kmovw %esi, %k1
+; KNL-NEXT: kshiftlw $10, %k0, %k0
+; KNL-NEXT: korw %k0, %k1, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test16:
+; SKX: ## BB#0:
+; SKX-NEXT: movzbl (%rdi), %eax
+; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: kmovd %eax, %k0
+; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: kshiftlw $10, %k0, %k0
+; SKX-NEXT: korw %k0, %k1, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: retq
%x = load i1 , i1 * %addr, align 128
%a1 = bitcast i16 %a to <16 x i1>
%x1 = insertelement <16 x i1> %a1, i1 %x, i32 10
@@ -215,15 +379,30 @@ define i16 @test16(i1 *%addr, i16 %a) {
ret i16 %x2
}
-;CHECK-LABEL: test17
-;KNL: movb (%rdi), %al
-;KNL: andw $1, %ax
-;KNL: kshiftlw $4
-;KNL: korw
-;SKX: kshiftlb $4
-;SKX: korb
-;CHECK: ret
define i8 @test17(i1 *%addr, i8 %a) {
+; KNL-LABEL: test17:
+; KNL: ## BB#0:
+; KNL-NEXT: movzbl (%rdi), %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: kmovw %eax, %k0
+; KNL-NEXT: kmovw %esi, %k1
+; KNL-NEXT: kshiftlw $4, %k0, %k0
+; KNL-NEXT: korw %k0, %k1, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test17:
+; SKX: ## BB#0:
+; SKX-NEXT: movzbl (%rdi), %eax
+; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: kmovd %eax, %k0
+; SKX-NEXT: kmovb %esi, %k1
+; SKX-NEXT: kshiftlb $4, %k0, %k0
+; SKX-NEXT: korb %k0, %k1, %k0
+; SKX-NEXT: kmovb %k0, %eax
+; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT: retq
%x = load i1 , i1 * %addr, align 128
%a1 = bitcast i8 %a to <8 x i1>
%x1 = insertelement <8 x i1> %a1, i1 %x, i32 4
@@ -232,6 +411,13 @@ define i8 @test17(i1 *%addr, i8 %a) {
}
define i64 @extract_v8i64(<8 x i64> %x, i64* %dst) {
+; KNL-LABEL: extract_v8i64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm0
+; KNL-NEXT: vpextrq $1, %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v8i64:
; SKX: ## BB#0:
; SKX-NEXT: vpextrq $1, %xmm0, %rax
@@ -245,10 +431,17 @@ define i64 @extract_v8i64(<8 x i64> %x, i64* %dst) {
}
define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) {
+; KNL-LABEL: extract_v4i64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrq $1, %xmm0, %rax
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpextrq $1, %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v4i64:
; SKX: ## BB#0:
; SKX-NEXT: vpextrq $1, %xmm0, %rax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
+; SKX-NEXT: vextracti64x2 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrq $1, %xmm0, (%rdi)
; SKX-NEXT: retq
%r1 = extractelement <4 x i64> %x, i32 1
@@ -258,6 +451,12 @@ define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) {
}
define i64 @extract_v2i64(<2 x i64> %x, i64* %dst) {
+; KNL-LABEL: extract_v2i64:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovq %xmm0, %rax
+; KNL-NEXT: vpextrq $1, %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v2i64:
; SKX: ## BB#0:
; SKX-NEXT: vmovq %xmm0, %rax
@@ -270,6 +469,13 @@ define i64 @extract_v2i64(<2 x i64> %x, i64* %dst) {
}
define i32 @extract_v16i32(<16 x i32> %x, i32* %dst) {
+; KNL-LABEL: extract_v16i32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrd $1, %xmm0, %eax
+; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm0
+; KNL-NEXT: vpextrd $1, %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v16i32:
; SKX: ## BB#0:
; SKX-NEXT: vpextrd $1, %xmm0, %eax
@@ -283,10 +489,17 @@ define i32 @extract_v16i32(<16 x i32> %x, i32* %dst) {
}
define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) {
+; KNL-LABEL: extract_v8i32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrd $1, %xmm0, %eax
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpextrd $1, %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v8i32:
; SKX: ## BB#0:
; SKX-NEXT: vpextrd $1, %xmm0, %eax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrd $1, %xmm0, (%rdi)
; SKX-NEXT: retq
%r1 = extractelement <8 x i32> %x, i32 1
@@ -296,6 +509,12 @@ define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) {
}
define i32 @extract_v4i32(<4 x i32> %x, i32* %dst) {
+; KNL-LABEL: extract_v4i32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrd $1, %xmm0, %eax
+; KNL-NEXT: vpextrd $3, %xmm0, (%rdi)
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v4i32:
; SKX: ## BB#0:
; SKX-NEXT: vpextrd $1, %xmm0, %eax
@@ -308,11 +527,20 @@ define i32 @extract_v4i32(<4 x i32> %x, i32* %dst) {
}
define i16 @extract_v32i16(<32 x i16> %x, i16* %dst) {
+; KNL-LABEL: extract_v32i16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrw $1, %xmm0, %eax
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpextrw $1, %xmm0, (%rdi)
+; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v32i16:
; SKX: ## BB#0:
; SKX-NEXT: vpextrw $1, %xmm0, %eax
; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0
; SKX-NEXT: vpextrw $1, %xmm0, (%rdi)
+; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SKX-NEXT: retq
%r1 = extractelement <32 x i16> %x, i32 1
%r2 = extractelement <32 x i16> %x, i32 9
@@ -321,11 +549,20 @@ define i16 @extract_v32i16(<32 x i16> %x, i16* %dst) {
}
define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) {
+; KNL-LABEL: extract_v16i16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrw $1, %xmm0, %eax
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpextrw $1, %xmm0, (%rdi)
+; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v16i16:
; SKX: ## BB#0:
; SKX-NEXT: vpextrw $1, %xmm0, %eax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrw $1, %xmm0, (%rdi)
+; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SKX-NEXT: retq
%r1 = extractelement <16 x i16> %x, i32 1
%r2 = extractelement <16 x i16> %x, i32 9
@@ -334,10 +571,18 @@ define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) {
}
define i16 @extract_v8i16(<8 x i16> %x, i16* %dst) {
+; KNL-LABEL: extract_v8i16:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrw $1, %xmm0, %eax
+; KNL-NEXT: vpextrw $3, %xmm0, (%rdi)
+; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v8i16:
; SKX: ## BB#0:
; SKX-NEXT: vpextrw $1, %xmm0, %eax
; SKX-NEXT: vpextrw $3, %xmm0, (%rdi)
+; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SKX-NEXT: retq
%r1 = extractelement <8 x i16> %x, i32 1
%r2 = extractelement <8 x i16> %x, i32 3
@@ -346,11 +591,20 @@ define i16 @extract_v8i16(<8 x i16> %x, i16* %dst) {
}
define i8 @extract_v64i8(<64 x i8> %x, i8* %dst) {
+; KNL-LABEL: extract_v64i8:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrb $1, %xmm0, %eax
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpextrb $1, %xmm0, (%rdi)
+; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v64i8:
; SKX: ## BB#0:
; SKX-NEXT: vpextrb $1, %xmm0, %eax
; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0
; SKX-NEXT: vpextrb $1, %xmm0, (%rdi)
+; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SKX-NEXT: retq
%r1 = extractelement <64 x i8> %x, i32 1
%r2 = extractelement <64 x i8> %x, i32 17
@@ -359,11 +613,20 @@ define i8 @extract_v64i8(<64 x i8> %x, i8* %dst) {
}
define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) {
+; KNL-LABEL: extract_v32i8:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrb $1, %xmm0, %eax
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpextrb $1, %xmm0, (%rdi)
+; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v32i8:
; SKX: ## BB#0:
; SKX-NEXT: vpextrb $1, %xmm0, %eax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrb $1, %xmm0, (%rdi)
+; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SKX-NEXT: retq
%r1 = extractelement <32 x i8> %x, i32 1
%r2 = extractelement <32 x i8> %x, i32 17
@@ -372,10 +635,18 @@ define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) {
}
define i8 @extract_v16i8(<16 x i8> %x, i8* %dst) {
+; KNL-LABEL: extract_v16i8:
+; KNL: ## BB#0:
+; KNL-NEXT: vpextrb $1, %xmm0, %eax
+; KNL-NEXT: vpextrb $3, %xmm0, (%rdi)
+; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: retq
+;
; SKX-LABEL: extract_v16i8:
; SKX: ## BB#0:
; SKX-NEXT: vpextrb $1, %xmm0, %eax
; SKX-NEXT: vpextrb $3, %xmm0, (%rdi)
+; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SKX-NEXT: retq
%r1 = extractelement <16 x i8> %x, i32 1
%r2 = extractelement <16 x i8> %x, i32 3
@@ -384,6 +655,15 @@ define i8 @extract_v16i8(<16 x i8> %x, i8* %dst) {
}
define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) {
+; KNL-LABEL: insert_v8i64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm1
+; KNL-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1
+; KNL-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; KNL-NEXT: retq
+;
; SKX-LABEL: insert_v8i64:
; SKX: ## BB#0:
; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
@@ -399,11 +679,20 @@ define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) {
}
define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) {
+; KNL-LABEL: insert_v4i64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
; SKX-LABEL: insert_v4i64:
; SKX: ## BB#0:
; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; SKX-NEXT: vextracti64x2 $1, %ymm0, %xmm1
; SKX-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1
; SKX-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: retq
@@ -414,6 +703,12 @@ define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) {
}
define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) {
+; KNL-LABEL: insert_v2i64:
+; KNL: ## BB#0:
+; KNL-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm0
+; KNL-NEXT: vpinsrq $3, %rdi, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
; SKX-LABEL: insert_v2i64:
; SKX: ## BB#0:
; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm0
@@ -426,6 +721,15 @@ define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) {
}
define <16 x i32> @insert_v16i32(<16 x i32> %x, i32 %y, i32* %ptr) {
+; KNL-LABEL: insert_v16i32:
+; KNL: ## BB#0:
+; KNL-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm1
+; KNL-NEXT: vpinsrd $1, %edi, %xmm1, %xmm1
+; KNL-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; KNL-NEXT: retq
+;
; SKX-LABEL: insert_v16i32:
; SKX: ## BB#0:
; SKX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
@@ -454,7 +758,7 @@ define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) {
; SKX: ## BB#0:
; SKX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
; SKX-NEXT: vpinsrd $1, %edi, %xmm1, %xmm1
; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: retq
@@ -520,7 +824,7 @@ define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) {
; SKX: ## BB#0:
; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
; SKX-NEXT: vpinsrw $1, %edi, %xmm1, %xmm1
; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: retq
@@ -573,11 +877,20 @@ define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) {
}
define <32 x i8> @insert_v32i8(<32 x i8> %x, i8 %y, i8* %ptr) {
+; KNL-LABEL: insert_v32i8:
+; KNL: ## BB#0:
+; KNL-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpinsrb $1, %edi, %xmm1, %xmm1
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+;
; SKX-LABEL: insert_v32i8:
; SKX: ## BB#0:
; SKX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
; SKX-NEXT: vpinsrb $1, %edi, %xmm1, %xmm1
; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: retq
@@ -646,7 +959,7 @@ define <8 x double> @test_insert_128_v8f64(<8 x double> %x, double %y) {
;
; SKX-LABEL: test_insert_128_v8f64:
; SKX: ## BB#0:
-; SKX-NEXT: vunpcklpd %xmm1, %xmm0, %xmm1
+; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; SKX-NEXT: vinsertf64x2 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%r = insertelement <8 x double> %x, double %y, i32 1
@@ -656,13 +969,13 @@ define <8 x double> @test_insert_128_v8f64(<8 x double> %x, double %y) {
define <16 x float> @test_insert_128_v16f32(<16 x float> %x, float %y) {
; KNL-LABEL: test_insert_128_v16f32:
; KNL: ## BB#0:
-; KNL-NEXT: vinsertps $16, %xmm1, %xmm0, %xmm1
+; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
; KNL-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_insert_128_v16f32:
; SKX: ## BB#0:
-; SKX-NEXT: vinsertps $16, %xmm1, %xmm0, %xmm1
+; SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
; SKX-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%r = insertelement <16 x float> %x, float %y, i32 1
@@ -679,7 +992,7 @@ define <16 x i16> @test_insert_128_v16i16(<16 x i16> %x, i16 %y) {
;
; SKX-LABEL: test_insert_128_v16i16:
; SKX: ## BB#0:
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
; SKX-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1
; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: retq
@@ -697,7 +1010,7 @@ define <32 x i8> @test_insert_128_v32i8(<32 x i8> %x, i8 %y) {
;
; SKX-LABEL: test_insert_128_v32i8:
; SKX: ## BB#0:
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
; SKX-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-intel-ocl.ll b/test/CodeGen/X86/avx512-intel-ocl.ll
index 2e1b27e4aecf..69e06f547ced 100644
--- a/test/CodeGen/X86/avx512-intel-ocl.ll
+++ b/test/CodeGen/X86/avx512-intel-ocl.ll
@@ -15,7 +15,8 @@ declare i32 @func_int(i32, i32)
; X32-LABEL: testf16_inp
; X32: vaddps {{.*}}, {{%zmm[0-1]}}
-; X32: movl %eax, (%esp)
+; Push is not deemed profitable if we're realigning the stack.
+; X32: {{pushl|movl}} %eax
; X32: call
; X32: ret
@@ -68,10 +69,10 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
; WIN64: vmovups {{.*(%rbp).*}}, %zmm21 # 64-byte Reload
; X64-LABEL: test_prolog_epilog
-; X64: kmovw %k7, {{.*}}(%rsp) ## 8-byte Folded Spill
-; X64: kmovw %k6, {{.*}}(%rsp) ## 8-byte Folded Spill
-; X64: kmovw %k5, {{.*}}(%rsp) ## 8-byte Folded Spill
-; X64: kmovw %k4, {{.*}}(%rsp) ## 8-byte Folded Spill
+; X64: kmovq %k7, {{.*}}(%rsp) ## 8-byte Spill
+; X64: kmovq %k6, {{.*}}(%rsp) ## 8-byte Spill
+; X64: kmovq %k5, {{.*}}(%rsp) ## 8-byte Spill
+; X64: kmovq %k4, {{.*}}(%rsp) ## 8-byte Spill
; X64: vmovups %zmm31, {{.*}}(%rsp) ## 64-byte Spill
; X64: vmovups %zmm16, {{.*}}(%rsp) ## 64-byte Spill
; X64: call
@@ -102,4 +103,4 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog_with_mask(<16 x float> %a
%mask1 = xor <16 x i1> %cmp_res, %mask
%c = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1>%mask1)
ret <16 x float> %c
-} \ No newline at end of file
+}
diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..7a0424bd2eeb
--- /dev/null
+++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -0,0 +1,1134 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c
+
+define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm512_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastd %xmm0, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastd %xmm0, %zmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <16 x i32> zeroinitializer
+ %res1 = bitcast <16 x i32> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm512_mask_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <16 x i32> zeroinitializer
+ %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
+ %res2 = bitcast <16 x i32> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm512_maskz_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <16 x i32> zeroinitializer
+ %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
+ %res2 = bitcast <16 x i32> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm512_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastq %xmm0, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastq %xmm0, %zmm0
+; X64-NEXT: retq
+ %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mm512_mask_broadcastq_epi64(<8 x i64> %a0, i8 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm512_mask_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm512_maskz_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
+ ret <8 x i64> %res1
+}
+
+define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) {
+; X32-LABEL: test_mm512_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastsd %xmm0, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastsd %xmm0, %zmm0
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_mm512_mask_broadcastsd_pd(<8 x double> %a0, i8 %a1, <2 x double> %a2) {
+; X32-LABEL: test_mm512_mask_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
+ ret <8 x double> %res1
+}
+
+define <8 x double> @test_mm512_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm512_maskz_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
+ ret <8 x double> %res1
+}
+
+define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm512_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastss %xmm0, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %zmm0
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) {
+; X32-LABEL: test_mm512_mask_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
+; X64-NEXT: retq
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <16 x i32> zeroinitializer
+ %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
+ ret <16 x float> %res1
+}
+
+define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm512_maskz_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <16 x i32> zeroinitializer
+ %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
+ ret <16 x float> %res1
+}
+
+define <8 x double> @test_mm512_movddup_pd(<8 x double> %a0) {
+; X32-LABEL: test_mm512_movddup_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_movddup_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_mm512_mask_movddup_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
+; X32-LABEL: test_mm512_mask_movddup_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_movddup_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
+ ret <8 x double> %res1
+}
+
+define <8 x double> @test_mm512_maskz_movddup_pd(i8 %a0, <8 x double> %a1) {
+; X32-LABEL: test_mm512_maskz_movddup_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_movddup_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
+ ret <8 x double> %res1
+}
+
+define <16 x float> @test_mm512_movehdup_ps(<16 x float> %a0) {
+; X32-LABEL: test_mm512_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X64-NEXT: retq
+ %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
+; X32-LABEL: test_mm512_mask_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X64-NEXT: retq
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
+ ret <16 x float> %res1
+}
+
+define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
+; X32-LABEL: test_mm512_maskz_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
+ ret <16 x float> %res1
+}
+
+define <16 x float> @test_mm512_moveldup_ps(<16 x float> %a0) {
+; X32-LABEL: test_mm512_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X64-NEXT: retq
+ %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
+; X32-LABEL: test_mm512_mask_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X64-NEXT: retq
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
+ ret <16 x float> %res1
+}
+
+define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) {
+; X32-LABEL: test_mm512_maskz_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
+ ret <16 x float> %res1
+}
+
+define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) {
+; X32-LABEL: test_mm512_permute_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_permute_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
+; X32-LABEL: test_mm512_mask_permute_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_permute_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
+ ret <8 x double> %res1
+}
+
+define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) {
+; X32-LABEL: test_mm512_maskz_permute_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_permute_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
+ ret <8 x double> %res1
+}
+
+define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) {
+; X32-LABEL: test_mm512_permute_ps:
+; X32: # BB#0:
+; X32-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_permute_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
+; X64-NEXT: retq
+ %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
+; X32-LABEL: test_mm512_mask_permute_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_permute_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
+; X64-NEXT: retq
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+ %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
+ ret <16 x float> %res1
+}
+
+define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) {
+; X32-LABEL: test_mm512_maskz_permute_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_permute_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+ %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
+ ret <16 x float> %res1
+}
+
+define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) {
+; X32-LABEL: test_mm512_permutex_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_permutex_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
+; X64-NEXT: retq
+ %res = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mm512_mask_permutex_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_mask_permutex_epi64:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_permutex_epi64:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x i64> %a2, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_maskz_permutex_epi64(i8 %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_maskz_permutex_epi64:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_permutex_epi64:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
+ ret <8 x i64> %res1
+}
+
+define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) {
+; X32-LABEL: test_mm512_permutex_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_permutex_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
+; X64-NEXT: retq
+ %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_mm512_mask_permutex_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
+; X32-LABEL: test_mm512_mask_permutex_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_permutex_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
+ ret <8 x double> %res1
+}
+
+define <8 x double> @test_mm512_maskz_permutex_pd(i8 %a0, <8 x double> %a1) {
+; X32-LABEL: test_mm512_maskz_permutex_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_permutex_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
+ ret <8 x double> %res1
+}
+
+define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) {
+; X32-LABEL: test_mm512_shuffle_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_shuffle_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
+ %res0 = shufflevector <16 x i32> %arg0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
+ %res1 = bitcast <16 x i32> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_mask_shuffle_epi32:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_shuffle_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
+ %res0 = shufflevector <16 x i32> %arg2, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
+ %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
+ %res2 = bitcast <16 x i32> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_maskz_shuffle_epi32:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_shuffle_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
+ %res0 = shufflevector <16 x i32> %arg1, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
+ %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
+ %res2 = bitcast <16 x i32> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x double> @test_mm512_shuffle_pd(<8 x double> %a0, <8 x double> %a1) {
+; X32-LABEL: test_mm512_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT: retq
+ %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_mm512_mask_shuffle_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
+; X32-LABEL: test_mm512_mask_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
+ ret <8 x double> %res1
+}
+
+define <8 x double> @test_mm512_maskz_shuffle_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
+; X32-LABEL: test_mm512_maskz_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
+ ret <8 x double> %res1
+}
+
+define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_unpackhi_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpackhi_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
+ %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
+ %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %res1 = bitcast <16 x i32> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
+; X32-LABEL: test_mm512_mask_unpackhi_epi32:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpackhi_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
+ %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
+ %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
+ %res2 = bitcast <16 x i32> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_maskz_unpackhi_epi32:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpackhi_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
+ %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
+ %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
+ %res2 = bitcast <16 x i32> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_unpackhi_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpackhi_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X64-NEXT: retq
+ %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mm512_mask_unpackhi_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
+; X32-LABEL: test_mm512_mask_unpackhi_epi64:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpackhi_epi64:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_maskz_unpackhi_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_maskz_unpackhi_epi64:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpackhi_epi64:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
+ ret <8 x i64> %res1
+}
+
+define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1) {
+; X32-LABEL: test_mm512_unpackhi_pd:
+; X32: # BB#0:
+; X32-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpackhi_pd:
+; X64: # BB#0:
+; X64-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X64-NEXT: retq
+ %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_mm512_mask_unpackhi_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
+; X32-LABEL: test_mm512_mask_unpackhi_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpackhi_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
+ ret <8 x double> %res1
+}
+
+define <8 x double> @test_mm512_maskz_unpackhi_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
+; X32-LABEL: test_mm512_maskz_unpackhi_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpackhi_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
+ ret <8 x double> %res1
+}
+
+define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1) {
+; X32-LABEL: test_mm512_unpackhi_ps:
+; X32: # BB#0:
+; X32-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpackhi_ps:
+; X64: # BB#0:
+; X64-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X64-NEXT: retq
+ %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
+; X32-LABEL: test_mm512_mask_unpackhi_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpackhi_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
+; X64-NEXT: retq
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
+ ret <16 x float> %res1
+}
+
+define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
+; X32-LABEL: test_mm512_maskz_unpackhi_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpackhi_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
+ ret <16 x float> %res1
+}
+
+define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_unpacklo_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpacklo_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
+ %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
+ %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %res1 = bitcast <16 x i32> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
+; X32-LABEL: test_mm512_mask_unpacklo_epi32:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpacklo_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
+ %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
+ %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
+ %res2 = bitcast <16 x i32> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_maskz_unpacklo_epi32:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpacklo_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
+ %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
+ %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
+ %res2 = bitcast <16 x i32> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_unpacklo_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpacklo_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT: retq
+ %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mm512_mask_unpacklo_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
+; X32-LABEL: test_mm512_mask_unpacklo_epi64:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpacklo_epi64:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_maskz_unpacklo_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_maskz_unpacklo_epi64:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpacklo_epi64:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
+ ret <8 x i64> %res1
+}
+
+define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1) {
+; X32-LABEL: test_mm512_unpacklo_pd:
+; X32: # BB#0:
+; X32-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpacklo_pd:
+; X64: # BB#0:
+; X64-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT: retq
+ %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_mm512_mask_unpacklo_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
+; X32-LABEL: test_mm512_mask_unpacklo_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpacklo_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
+ ret <8 x double> %res1
+}
+
+define <8 x double> @test_mm512_maskz_unpacklo_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
+; X32-LABEL: test_mm512_maskz_unpacklo_pd:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpacklo_pd:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
+ ret <8 x double> %res1
+}
+
+define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1) {
+; X32-LABEL: test_mm512_unpacklo_ps:
+; X32: # BB#0:
+; X32-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpacklo_ps:
+; X64: # BB#0:
+; X64-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X64-NEXT: retq
+ %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
+; X32-LABEL: test_mm512_mask_unpacklo_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpacklo_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
+; X64-NEXT: retq
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
+ ret <16 x float> %res1
+}
+
+define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
+; X32-LABEL: test_mm512_maskz_unpacklo_ps:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpacklo_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
+ ret <16 x float> %res1
+}
+
+!0 = !{i32 1}
+
diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
new file mode 100644
index 000000000000..7d0535546dfa
--- /dev/null
+++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -0,0 +1,1089 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly
+
+define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) {
+; CHECK-LABEL: test_x86_vbroadcast_ss_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm2
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+
+ %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 -1)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res2, %res3
+ ret <16 x float> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double>, <8 x double>, i8) nounwind readonly
+
+define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask ) {
+; CHECK-LABEL: test_x86_vbroadcast_sd_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+
+ %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 -1)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res2, %res3
+ ret <8 x double> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
+ %res1 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask)
+ %res2 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res2, %res3
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1)
+ %res1 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask)
+ %res2 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer,i8 %mask)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res2, %res3
+ ret <8 x i64> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res2, %res3
+ ret <16 x float> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res2, %res3
+ ret <16 x float> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movddup_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res2, %res3
+ ret <8 x double> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double>, i32, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, i32 %x1, <8 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermpd {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4]
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 %x3)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> zeroinitializer, i8 %x3)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 -1)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res3, %res2
+ ret <8 x double> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64>, i32, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermq {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4]
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res3, %res2
+ ret <8 x i64> %res4
+}
+
+define void @test_store1(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
+; CHECK-LABEL: test_store1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovups %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vmovups %zmm0, (%rsi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
+ call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16 )
+
+define void @test_store2(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) {
+; CHECK-LABEL: test_store2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovupd %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vmovupd %zmm0, (%rsi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
+ call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8)
+
+define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
+; CHECK-LABEL: test_mask_store_aligned_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovaps %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vmovaps %zmm0, (%rsi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
+ call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.ps.512(i8*, <16 x float>, i16 )
+
+define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) {
+; CHECK-LABEL: test_mask_store_aligned_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovapd %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vmovapd %zmm0, (%rsi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
+ call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.pd.512(i8*, <8 x double>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vmovdqu64 %zmm0, (%rsi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.q.512(i8*, <8 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vmovdqu32 %zmm0, (%rsi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
+ call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.d.512(i8*, <16 x i32>, i16)
+
+define void@test_int_x86_avx512_mask_store_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.q.512(i8*, <8 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_store_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_d_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqa32 %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: vmovdqa32 %zmm0, (%rsi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
+ call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.d.512(i8*, <16 x i32>, i16)
+
+define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm0
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovaps (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
+ %res4 = fadd <16 x float> %res2, %res1
+ ret <16 x float> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16)
+
+define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups (%rdi), %zmm0
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovups (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
+ %res4 = fadd <16 x float> %res2, %res1
+ ret <16 x float> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8*, <16 x float>, i16)
+
+define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm0
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovapd (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <8 x double> %res2, %res1
+ ret <8 x double> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)
+
+define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd (%rdi), %zmm0
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovupd (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <8 x double> %res2, %res1
+ ret <8 x double> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8*, <8 x double>, i8)
+
+declare <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8*, <16 x i32>, i16)
+
+define <16 x i32> @test_mask_load_unaligned_d(i8* %ptr, i8* %ptr2, <16 x i32> %data, i16 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqu32 (%rsi), %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu32 (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 -1)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr2, <16 x i32> %res, i16 %mask)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask)
+ %res4 = add <16 x i32> %res2, %res1
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8*, <8 x i64>, i8)
+
+define <8 x i64> @test_mask_load_unaligned_q(i8* %ptr, i8* %ptr2, <8 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqu64 (%rsi), %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 -1)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr2, <8 x i64> %res, i8 %mask)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <8 x i64> %res2, %res1
+ ret <8 x i64> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8*, <16 x i32>, i16)
+
+define <16 x i32> @test_mask_load_aligned_d(<16 x i32> %data, i8* %ptr, i16 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 -1)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> %res, i16 %mask)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask)
+ %res4 = add <16 x i32> %res2, %res1
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8*, <8 x i64>, i8)
+
+define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 -1)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> %res, i8 %mask)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <8 x i64> %res2, %res1
+ ret <8 x i64> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm2 = zmm0[0,1,3,2,5,4,6,6]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,1,3,2,5,4,6,6]
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,3,2,5,4,6,6]
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res3, %res2
+ ret <8 x double> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float>, i32, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilps {{.*#+}} zmm2 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res3, %res2
+ ret <16 x float> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32>, i32, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
+; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res3, %res2
+ ret <16 x i32> %res4
+}
+
+define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_pcmpeq_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
+ ret i16 %res
+}
+
+define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
+ ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16)
+
+define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_pcmpeq_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8)
+
+define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_pcmpgt_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
+ ret i16 %res
+}
+
+define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
+ ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16)
+
+define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_pcmpgt_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8)
+
+declare <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer,i8 %x3)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res2, %res3
+ ret <8 x i64> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res2 = add <8 x i64> %res, %res1
+ ret <8 x i64> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckldq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpunpckldq {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+
+define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) {
+; CHECK-LABEL: test_x86_avx512_pslli_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpslld $7, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+; CHECK-LABEL: test_x86_avx512_mask_pslli_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpslld $7, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) {
+; CHECK-LABEL: test_x86_avx512_maskz_pslli_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpslld $7, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) {
+; CHECK-LABEL: test_x86_avx512_pslli_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+; CHECK-LABEL: test_x86_avx512_mask_pslli_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsllq $7, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) {
+; CHECK-LABEL: test_x86_avx512_maskz_pslli_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
+
+define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) {
+; CHECK-LABEL: test_x86_avx512_psrli_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+; CHECK-LABEL: test_x86_avx512_mask_psrli_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsrld $7, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) {
+; CHECK-LABEL: test_x86_avx512_maskz_psrli_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) {
+; CHECK-LABEL: test_x86_avx512_psrli_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+; CHECK-LABEL: test_x86_avx512_mask_psrli_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) {
+; CHECK-LABEL: test_x86_avx512_maskz_psrli_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
+
+define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) {
+; CHECK-LABEL: test_x86_avx512_psrai_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+; CHECK-LABEL: test_x86_avx512_mask_psrai_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsrad $7, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) {
+; CHECK-LABEL: test_x86_avx512_maskz_psrai_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) {
+; CHECK-LABEL: test_x86_avx512_psrai_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+; CHECK-LABEL: test_x86_avx512_mask_psrai_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsraq $7, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) {
+; CHECK-LABEL: test_x86_avx512_maskz_psrai_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
+
+declare void @llvm.x86.avx512.storent.q.512(i8*, <8 x i64>)
+
+define void@test_storent_q_512(<8 x i64> %data, i8* %ptr) {
+; CHECK-LABEL: test_storent_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovntdq %zmm0, (%rdi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.storent.q.512(i8* %ptr, <8 x i64> %data)
+ ret void
+}
+
+declare void @llvm.x86.avx512.storent.pd.512(i8*, <8 x double>)
+
+define void @test_storent_pd_512(<8 x double> %data, i8* %ptr) {
+; CHECK-LABEL: test_storent_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovntpd %zmm0, (%rdi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.storent.pd.512(i8* %ptr, <8 x double> %data)
+ ret void
+}
+
+declare void @llvm.x86.avx512.storent.ps.512(i8*, <16 x float>)
+
+define void @test_storent_ps_512(<16 x float> %data, i8* %ptr) {
+; CHECK-LABEL: test_storent_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovntps %zmm0, (%rdi)
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.storent.ps.512(i8* %ptr, <16 x float> %data)
+ ret void
+}
+
+define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_xor_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_or_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_or_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_and_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_and_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_xor_epi64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_or_epi64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_and_epi64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 7179f742cc66..65ed77374388 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -7,11 +7,9 @@ define i32 @test_kortestz(i16 %a0, i16 %a1) {
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k0
; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: kortestw %k0, %k1
; CHECK-NEXT: sete %al
-; CHECK-NEXT: kmovw %eax, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andl $1, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.kortestz.w(i16 %a0, i16 %a1)
ret i32 %res
@@ -69,6 +67,7 @@ define i16 @unpckbw_test(i16 %a0, i16 %a1) {
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: kunpckbw %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
ret i16 %res
@@ -126,26 +125,6 @@ define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) {
}
declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
-define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
-; CHECK-LABEL: test_rsqrt14_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vrsqrt14ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
-
-define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
-; CHECK-LABEL: test_rcp14_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
-
define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_sqrt_pd_512:
; CHECK: ## BB#0:
@@ -424,12 +403,154 @@ declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone
define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2usi64:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2usi %xmm0, %rax
+; CHECK-NEXT: vcvtsd2usi %xmm0, %rcx
+; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rax
+; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: retq
- %res = call i64 @llvm.x86.avx512.cvtsd2usi64(<2 x double> %a0) ; <i64> [#uses=1]
- ret i64 %res
+
+ %res = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 4)
+ %res1 = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 3)
+ %res2 = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 1)
+ %res3 = add i64 %res, %res1
+ %res4 = add i64 %res3, %res2
+ ret i64 %res4
}
-declare i64 @llvm.x86.avx512.cvtsd2usi64(<2 x double>) nounwind readnone
+declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32) nounwind readnone
+
+define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) {
+; CHECK-LABEL: test_x86_avx512_cvtsd2si64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtsd2si %xmm0, %rcx
+; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rax
+; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: retq
+
+ %res = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 4)
+ %res1 = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 3)
+ %res2 = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 1)
+ %res3 = add i64 %res, %res1
+ %res4 = add i64 %res3, %res2
+ ret i64 %res4
+}
+declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32) nounwind readnone
+
+define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_avx512_cvtss2usi64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtss2usi %xmm0, %rcx
+; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rax
+; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: retq
+
+ %res = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 4)
+ %res1 = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 3)
+ %res2 = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 1)
+ %res3 = add i64 %res, %res1
+ %res4 = add i64 %res3, %res2
+ ret i64 %res4
+}
+declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32) nounwind readnone
+
+define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_avx512_cvtss2si64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtss2si %xmm0, %rcx
+; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rax
+; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rdx
+; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: retq
+
+ %res = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 4)
+ %res1 = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 3)
+ %res2 = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 1)
+ %res3 = add i64 %res, %res1
+ %res4 = add i64 %res3, %res2
+ ret i64 %res4
+}
+declare i64 @llvm.x86.avx512.vcvtss2si64(<4 x float>, i32) nounwind readnone
+
+define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) {
+; CHECK-LABEL: test_x86_avx512_cvtsd2usi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtsd2usi %xmm0, %ecx
+; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %eax
+; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: retq
+
+ %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4)
+ %res1 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 3)
+ %res2 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 1)
+ %res3 = add i32 %res, %res1
+ %res4 = add i32 %res3, %res2
+ ret i32 %res4
+}
+declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone
+
+define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
+; CHECK-LABEL: test_x86_avx512_cvtsd2si32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtsd2si %xmm0, %ecx
+; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %eax
+; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: retq
+
+ %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4)
+ %res1 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 3)
+ %res2 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 1)
+ %res3 = add i32 %res, %res1
+ %res4 = add i32 %res3, %res2
+ ret i32 %res4
+}
+declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone
+
+define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_avx512_cvtss2usi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtss2usi %xmm0, %ecx
+; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %eax
+; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: retq
+
+ %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4)
+ %res1 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 3)
+ %res2 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 1)
+ %res3 = add i32 %res, %res1
+ %res4 = add i32 %res3, %res2
+ ret i32 %res4
+}
+declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
+
+define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_avx512_cvtss2si32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtss2si %xmm0, %ecx
+; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %eax
+; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: retq
+
+ %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4)
+ %res1 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 3)
+ %res2 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 1)
+ %res3 = add i32 %res, %res1
+ %res4 = add i32 %res3, %res2
+ ret i32 %res4
+}
+declare i32 @llvm.x86.avx512.vcvtss2si32(<4 x float>, i32) nounwind readnone
define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) {
; CHECK-LABEL: test_x86_vcvtph2ps_512:
@@ -482,13 +603,20 @@ define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) {
declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly
-
-define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0) {
+define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 %mask, <16 x i16>* %dst) {
; CHECK-LABEL: test_x86_vcvtps2ph_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm0
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1}
+; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vcvtps2ph $2, %zmm0, (%rsi)
+; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm0
; CHECK-NEXT: retq
- %res = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask)
+ %res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> %src, i16 %mask)
+ store <16 x i16> %res1, <16 x i16>* %dst
+ %res = add <16 x i16> %res2, %res3
ret <16 x i16> %res
}
@@ -514,100 +642,6 @@ define <8 x double> @test_x86_vbroadcast_sd_512(i8* %a0) {
}
declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly
-define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) {
-; CHECK-LABEL: test_x86_vbroadcast_ss_ps_512:
-; CHECK: kmovw %edi, %k1
-; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vbroadcastss %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
-; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
-
- %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 -1)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask)
- %res3 = fadd <16 x float> %res, %res1
- %res4 = fadd <16 x float> %res2, %res3
- ret <16 x float> %res4
-}
-declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly
-
-
-define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask ) {
-; CHECK-LABEL: test_x86_vbroadcast_sd_pd_512:
-; CHECK: kmovw %eax, %k1
-; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
-; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
-
- %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 -1)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask)
- %res3 = fadd <8 x double> %res, %res1
- %res4 = fadd <8 x double> %res2, %res3
- ret <8 x double> %res4
-}
-declare <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double>, <8 x double>, i8) nounwind readonly
-
-define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
- %res1 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask)
- %res2 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
- %res3 = add <16 x i32> %res, %res1
- %res4 = add <16 x i32> %res2, %res3
- ret <16 x i32> %res4
-}
-declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16)
-
-define <16 x i32> @test_x86_pbroadcastd_i32_512(i32 %a0) {
-; CHECK-LABEL: test_x86_pbroadcastd_i32_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpbroadcastd %edi, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.i32.512(i32 %a0) ; <<16 x i32>> [#uses=1]
- ret <16 x i32> %res
-}
-declare <16 x i32> @llvm.x86.avx512.pbroadcastd.i32.512(i32) nounwind readonly
-
-define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1)
- %res1 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask)
- %res2 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer,i8 %mask)
- %res3 = add <8 x i64> %res, %res1
- %res4 = add <8 x i64> %res2, %res3
- ret <8 x i64> %res4
-}
-declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8)
-
-define <8 x i64> @test_x86_pbroadcastq_i64_512(i64 %a0) {
-; CHECK-LABEL: test_x86_pbroadcastq_i64_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpbroadcastq %rdi, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64 %a0) ; <<8 x i64>> [#uses=1]
- ret <8 x i64> %res
-}
-declare <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64) nounwind readonly
-
define <16 x i32> @test_conflict_d(<16 x i32> %a) {
; CHECK-LABEL: test_conflict_d:
; CHECK: ## BB#0:
@@ -643,8 +677,7 @@ define <16 x i32> @test_maskz_conflict_d(<16 x i32> %a, i16 %mask) {
define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_conflict_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpconflictq %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -689,8 +722,7 @@ define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_lzcnt_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntq %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -698,70 +730,12 @@ define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
ret <8 x i64> %res
}
-define <16 x float> @test_x86_mask_blend_ps_512(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
-; CHECK-LABEL: test_x86_mask_blend_ps_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vblendmps %zmm1, %zmm0, %zmm0 {%k1}
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float> %a1, <16 x float> %a2, i16 %a0) ; <<16 x float>> [#uses=1]
- ret <16 x float> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float>, <16 x float>, i16) nounwind readonly
-
-define <8 x double> @test_x86_mask_blend_pd_512(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
-; CHECK-LABEL: test_x86_mask_blend_pd_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vblendmpd %zmm1, %zmm0, %zmm0 {%k1}
-; CHECK-NEXT: retq
- %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a1, <8 x double> %a2, i8 %a0) ; <<8 x double>> [#uses=1]
- ret <8 x double> %res
-}
-
-define <8 x double> @test_x86_mask_blend_pd_512_memop(<8 x double> %a, <8 x double>* %ptr, i8 %mask) {
-; CHECK-LABEL: test_x86_mask_blend_pd_512_memop:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vblendmpd (%rdi), %zmm0, %zmm0 {%k1}
-; CHECK-NEXT: retq
- %b = load <8 x double>, <8 x double>* %ptr
- %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a, <8 x double> %b, i8 %mask) ; <<8 x double>> [#uses=1]
- ret <8 x double> %res
-}
-declare <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double>, <8 x double>, i8) nounwind readonly
-
-define <16 x i32> @test_x86_mask_blend_d_512(i16 %a0, <16 x i32> %a1, <16 x i32> %a2) {
-; CHECK-LABEL: test_x86_mask_blend_d_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpblendmd %zmm1, %zmm0, %zmm0 {%k1}
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.blend.d.512(<16 x i32> %a1, <16 x i32> %a2, i16 %a0) ; <<16 x i32>> [#uses=1]
- ret <16 x i32> %res
-}
-declare <16 x i32> @llvm.x86.avx512.mask.blend.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
-
-define <8 x i64> @test_x86_mask_blend_q_512(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
-; CHECK-LABEL: test_x86_mask_blend_q_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpblendmq %zmm1, %zmm0, %zmm0 {%k1}
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64> %a1, <8 x i64> %a2, i8 %a0) ; <<8 x i64>> [#uses=1]
- ret <8 x i64> %res
-}
-declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
-
define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
; CHECK-LABEL: test_cmpps:
; CHECK: ## BB#0:
; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
ret i16 %res
@@ -773,6 +747,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) no
; CHECK: ## BB#0:
; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4)
ret i8 %res
@@ -825,8 +800,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pabs_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpabsq %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vpabsq %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
@@ -837,149 +811,41 @@ define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x
ret <8 x i64> %res2
}
-define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1) {
+define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1, i8 %m) {
; CHECK-LABEL: test_vptestmq:
; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vptestmq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %ecx
; CHECK-NEXT: vptestmq %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: addb %cl, %al
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
- %res = call i8 @llvm.x86.avx512.mask.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
- ret i8 %res
+ %res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
+ %res1 = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 %m)
+ %res2 = add i8 %res1, %res
+ ret i8 %res2
}
-declare i8 @llvm.x86.avx512.mask.ptestm.q.512(<8 x i64>, <8 x i64>, i8)
+declare i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64>, <8 x i64>, i8)
-define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1) {
+define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1, i16 %m) {
; CHECK-LABEL: test_vptestmd:
; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vptestmd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %ecx
; CHECK-NEXT: vptestmd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
- %res = call i16 @llvm.x86.avx512.mask.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1)
- ret i16 %res
-}
-declare i16 @llvm.x86.avx512.mask.ptestm.d.512(<16 x i32>, <16 x i32>, i16)
-
-define void @test_store1(<16 x float> %data, i8* %ptr, i16 %mask) {
-; CHECK-LABEL: test_store1:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovups %zmm0, (%rdi) {%k1}
-; CHECK-NEXT: retq
- call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16 )
-
-define void @test_store2(<8 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_store2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovupd %zmm0, (%rdi) {%k1}
-; CHECK-NEXT: retq
- call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8)
-
-define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
-; CHECK-LABEL: test_mask_store_aligned_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovaps %zmm0, (%rdi) {%k1}
-; CHECK-NEXT: retq
- call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.store.ps.512(i8*, <16 x float>, i16 )
-
-define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_store_aligned_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovapd %zmm0, (%rdi) {%k1}
-; CHECK-NEXT: retq
- call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.store.pd.512(i8*, <8 x double>, i8)
-
-define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
-; CHECK-LABEL: test_mask_load_aligned_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovaps (%rdi), %zmm0
-; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1}
-; CHECK-NEXT: vmovaps (%rdi), %zmm1 {%k1} {z}
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
- %res4 = fadd <16 x float> %res2, %res1
- ret <16 x float> %res4
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16)
-
-define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
-; CHECK-LABEL: test_mask_load_unaligned_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovups (%rdi), %zmm0
-; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1}
-; CHECK-NEXT: vmovups (%rdi), %zmm1 {%k1} {z}
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
- %res4 = fadd <16 x float> %res2, %res1
- ret <16 x float> %res4
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8*, <16 x float>, i16)
-
-define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_aligned_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovapd (%rdi), %zmm0
-; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1}
-; CHECK-NEXT: vmovapd (%rdi), %zmm1 {%k1} {z}
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
- %res4 = fadd <8 x double> %res2, %res1
- ret <8 x double> %res4
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)
-
-define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_unaligned_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovupd (%rdi), %zmm0
-; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1}
-; CHECK-NEXT: vmovupd (%rdi), %zmm1 {%k1} {z}
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
- %res4 = fadd <8 x double> %res2, %res1
- ret <8 x double> %res4
+ %res = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1)
+ %res1 = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 %m)
+ %res2 = add i16 %res1, %res
+ ret i16 %res2
}
-
-declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8*, <8 x double>, i8)
+declare i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32>, <16 x i32>, i16)
define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_valign_q:
@@ -993,8 +859,7 @@ define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) {
; CHECK-LABEL: test_mask_valign_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: valignq $2, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -1028,127 +893,33 @@ define void @test_mask_store_ss(i8* %ptr, <4 x float> %data, i8 %mask) {
declare void @llvm.x86.avx512.mask.store.ss(i8*, <4 x float>, i8 )
-define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: test_pcmpeq_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
- %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
- ret i16 %res
-}
-
-define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
- %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
- ret i16 %res
-}
-
-declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16)
-
-define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) {
-; CHECK-LABEL: test_pcmpeq_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8)
-
-define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: test_pcmpgt_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
- %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
- ret i16 %res
-}
-
-define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
- %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
- ret i16 %res
-}
-
-declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16)
-
-define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) {
-; CHECK-LABEL: test_pcmpgt_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8)
-
define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_cmp_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r8d
-; CHECK-NEXT: vpcmpltd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r9d
-; CHECK-NEXT: vpcmpled %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r10d
-; CHECK-NEXT: vpcmpunordd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %esi
-; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %edi
-; CHECK-NEXT: vpcmpnltd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vpcmpnled %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpltd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpcmpled %zmm1, %zmm0, %k2
+; CHECK-NEXT: vpcmpunordd %zmm1, %zmm0, %k3
+; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k4
+; CHECK-NEXT: vpcmpnltd %zmm1, %zmm0, %k5
+; CHECK-NEXT: vpcmpnled %zmm1, %zmm0, %k6
+; CHECK-NEXT: vpcmpordd %zmm1, %zmm0, %k7
+; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vpcmpordd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %edx
-; CHECK-NEXT: vmovd %r8d, %xmm0
-; CHECK-NEXT: vpinsrw $1, %r9d, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $2, %r10d, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $3, %esi, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %ecx, %xmm0
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k2, %eax
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k3, %eax
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k4, %eax
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k5, %eax
; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k6, %eax
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k7, %eax
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -1174,29 +945,29 @@ define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask)
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r8d
-; CHECK-NEXT: vpcmpltd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r9d
-; CHECK-NEXT: vpcmpled %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r10d
-; CHECK-NEXT: vpcmpunordd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %esi
-; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %edi
-; CHECK-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vpcmpnled %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vpcmpltd %zmm1, %zmm0, %k2 {%k1}
+; CHECK-NEXT: vpcmpled %zmm1, %zmm0, %k3 {%k1}
+; CHECK-NEXT: vpcmpunordd %zmm1, %zmm0, %k4 {%k1}
+; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k5 {%k1}
+; CHECK-NEXT: vpcmpnltd %zmm1, %zmm0, %k6 {%k1}
+; CHECK-NEXT: vpcmpnled %zmm1, %zmm0, %k7 {%k1}
+; CHECK-NEXT: vpcmpordd %zmm1, %zmm0, %k1 {%k1}
+; CHECK-NEXT: kmovw %k2, %eax
; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vpcmpordd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %edx
-; CHECK-NEXT: vmovd %r8d, %xmm0
-; CHECK-NEXT: vpinsrw $1, %r9d, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $2, %r10d, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $3, %esi, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %ecx, %xmm0
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k3, %eax
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k4, %eax
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k5, %eax
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k6, %eax
; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k7, %eax
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -1223,29 +994,29 @@ define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_ucmp_d_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpequd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r8d
-; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r9d
-; CHECK-NEXT: vpcmpleud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r10d
-; CHECK-NEXT: vpcmpunordud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %esi
-; CHECK-NEXT: vpcmpnequd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %edi
-; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpcmpleud %zmm1, %zmm0, %k2
+; CHECK-NEXT: vpcmpunordud %zmm1, %zmm0, %k3
+; CHECK-NEXT: vpcmpnequd %zmm1, %zmm0, %k4
+; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k5
+; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k6
+; CHECK-NEXT: vpcmpordud %zmm1, %zmm0, %k7
+; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vpcmpordud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %edx
-; CHECK-NEXT: vmovd %r8d, %xmm0
-; CHECK-NEXT: vpinsrw $1, %r9d, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $2, %r10d, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $3, %esi, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %ecx, %xmm0
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k2, %eax
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k3, %eax
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k4, %eax
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k5, %eax
; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k6, %eax
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k7, %eax
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -1271,29 +1042,29 @@ define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpequd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r8d
-; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r9d
-; CHECK-NEXT: vpcmpleud %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r10d
-; CHECK-NEXT: vpcmpunordud %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %esi
-; CHECK-NEXT: vpcmpnequd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %edi
-; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k2 {%k1}
+; CHECK-NEXT: vpcmpleud %zmm1, %zmm0, %k3 {%k1}
+; CHECK-NEXT: vpcmpunordud %zmm1, %zmm0, %k4 {%k1}
+; CHECK-NEXT: vpcmpnequd %zmm1, %zmm0, %k5 {%k1}
+; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k6 {%k1}
+; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k7 {%k1}
+; CHECK-NEXT: vpcmpordud %zmm1, %zmm0, %k1 {%k1}
+; CHECK-NEXT: kmovw %k2, %eax
; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vpcmpordud %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %edx
-; CHECK-NEXT: vmovd %r8d, %xmm0
-; CHECK-NEXT: vpinsrw $1, %r9d, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $2, %r10d, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $3, %esi, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %ecx, %xmm0
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k3, %eax
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k4, %eax
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k5, %eax
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k6, %eax
; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k7, %eax
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -1320,36 +1091,28 @@ define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_cmp_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r8d
-; CHECK-NEXT: vpcmpltq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r9d
-; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r10d
-; CHECK-NEXT: vpcmpunordq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r11d
-; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %edi
-; CHECK-NEXT: vpcmpnltq %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpltq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k2
+; CHECK-NEXT: vpcmpunordq %zmm1, %zmm0, %k3
+; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k4
+; CHECK-NEXT: vpcmpnltq %zmm1, %zmm0, %k5
+; CHECK-NEXT: vpcmpnleq %zmm1, %zmm0, %k6
+; CHECK-NEXT: vpcmpordq %zmm1, %zmm0, %k7
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vpcmpnleq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vpcmpordq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %edx
-; CHECK-NEXT: movzbl %r8b, %esi
-; CHECK-NEXT: vpinsrb $0, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r9b, %esi
-; CHECK-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r10b, %esi
-; CHECK-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r11b, %esi
-; CHECK-NEXT: vpinsrb $6, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %dil, %esi
-; CHECK-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k2, %eax
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k3, %eax
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k4, %eax
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k5, %eax
; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %cl, %eax
+; CHECK-NEXT: kmovw %k6, %eax
; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %dl, %eax
+; CHECK-NEXT: kmovw %k7, %eax
; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
@@ -1374,39 +1137,30 @@ define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r8d
-; CHECK-NEXT: vpcmpltq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r9d
-; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r10d
-; CHECK-NEXT: vpcmpunordq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r11d
-; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %edi
-; CHECK-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vpcmpltq %zmm1, %zmm0, %k2 {%k1}
+; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k3 {%k1}
+; CHECK-NEXT: vpcmpunordq %zmm1, %zmm0, %k4 {%k1}
+; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k5 {%k1}
+; CHECK-NEXT: vpcmpnltq %zmm1, %zmm0, %k6 {%k1}
+; CHECK-NEXT: vpcmpnleq %zmm1, %zmm0, %k7 {%k1}
+; CHECK-NEXT: vpcmpordq %zmm1, %zmm0, %k1 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vpcmpnleq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vpcmpordq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %edx
-; CHECK-NEXT: movzbl %r8b, %esi
-; CHECK-NEXT: vpinsrb $0, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r9b, %esi
-; CHECK-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r10b, %esi
-; CHECK-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r11b, %esi
-; CHECK-NEXT: vpinsrb $6, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %dil, %esi
-; CHECK-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k2, %eax
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k3, %eax
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k4, %eax
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k5, %eax
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k6, %eax
; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %cl, %eax
+; CHECK-NEXT: kmovw %k7, %eax
; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %dl, %eax
+; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
@@ -1434,36 +1188,28 @@ define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_ucmp_q_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpequq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r8d
-; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r9d
-; CHECK-NEXT: vpcmpleuq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r10d
-; CHECK-NEXT: vpcmpunorduq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %r11d
-; CHECK-NEXT: vpcmpnequq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %edi
-; CHECK-NEXT: vpcmpnltuq %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpcmpleuq %zmm1, %zmm0, %k2
+; CHECK-NEXT: vpcmpunorduq %zmm1, %zmm0, %k3
+; CHECK-NEXT: vpcmpnequq %zmm1, %zmm0, %k4
+; CHECK-NEXT: vpcmpnltuq %zmm1, %zmm0, %k5
+; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k6
+; CHECK-NEXT: vpcmporduq %zmm1, %zmm0, %k7
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vpcmporduq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %edx
-; CHECK-NEXT: movzbl %r8b, %esi
-; CHECK-NEXT: vpinsrb $0, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r9b, %esi
-; CHECK-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r10b, %esi
-; CHECK-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r11b, %esi
-; CHECK-NEXT: vpinsrb $6, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %dil, %esi
-; CHECK-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k2, %eax
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k3, %eax
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k4, %eax
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k5, %eax
; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %cl, %eax
+; CHECK-NEXT: kmovw %k6, %eax
; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %dl, %eax
+; CHECK-NEXT: kmovw %k7, %eax
; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
@@ -1488,39 +1234,30 @@ define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpequq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r8d
-; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r9d
-; CHECK-NEXT: vpcmpleuq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r10d
-; CHECK-NEXT: vpcmpunorduq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %r11d
-; CHECK-NEXT: vpcmpnequq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %edi
-; CHECK-NEXT: vpcmpnltuq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k2 {%k1}
+; CHECK-NEXT: vpcmpleuq %zmm1, %zmm0, %k3 {%k1}
+; CHECK-NEXT: vpcmpunorduq %zmm1, %zmm0, %k4 {%k1}
+; CHECK-NEXT: vpcmpnequq %zmm1, %zmm0, %k5 {%k1}
+; CHECK-NEXT: vpcmpnltuq %zmm1, %zmm0, %k6 {%k1}
+; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k7 {%k1}
+; CHECK-NEXT: vpcmporduq %zmm1, %zmm0, %k1 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vpcmporduq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %edx
-; CHECK-NEXT: movzbl %r8b, %esi
-; CHECK-NEXT: vpinsrb $0, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r9b, %esi
-; CHECK-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r10b, %esi
-; CHECK-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %r11b, %esi
-; CHECK-NEXT: vpinsrb $6, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %dil, %esi
-; CHECK-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k2, %eax
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k3, %eax
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k4, %eax
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k5, %eax
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %k6, %eax
; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %cl, %eax
+; CHECK-NEXT: kmovw %k7, %eax
; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; CHECK-NEXT: movzbl %dl, %eax
+; CHECK-NEXT: kmovw %k1, %eax
; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; CHECK-NEXT: retq
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
@@ -1591,204 +1328,6 @@ define <4 x double> @test_vextractf64x4(<8 x double> %a) {
declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8)
-define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) {
-; CHECK-LABEL: test_x86_avx512_pslli_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpslld $7, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_pslli_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpslld $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_pslli_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpslld $7, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
- ret <16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
-
-define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) {
-; CHECK-LABEL: test_x86_avx512_pslli_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_pslli_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllq $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_pslli_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
-
-define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) {
-; CHECK-LABEL: test_x86_avx512_psrli_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psrli_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsrld $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psrli_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
- ret <16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
-
-define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) {
-; CHECK-LABEL: test_x86_avx512_psrli_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psrli_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psrli_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
-
-define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) {
-; CHECK-LABEL: test_x86_avx512_psrai_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psrai_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsrad $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psrai_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
- ret <16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
-
-define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) {
-; CHECK-LABEL: test_x86_avx512_psrai_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psrai_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsraq $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psrai_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
-
define <16 x i32> @test_x86_avx512_psll_d(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psll_d:
; CHECK: ## BB#0:
@@ -1833,8 +1372,7 @@ define <8 x i64> @test_x86_avx512_psll_q(<8 x i64> %a0, <2 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psll_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -1845,8 +1383,7 @@ define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x
define <8 x i64> @test_x86_avx512_maskz_psll_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psll_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -1899,8 +1436,7 @@ define <8 x i64> @test_x86_avx512_psrl_q(<8 x i64> %a0, <2 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrl_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -1911,8 +1447,7 @@ define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x
define <8 x i64> @test_x86_avx512_maskz_psrl_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrl_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -1965,8 +1500,7 @@ define <8 x i64> @test_x86_avx512_psra_q(<8 x i64> %a0, <2 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psra_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -1977,8 +1511,7 @@ define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x
define <8 x i64> @test_x86_avx512_maskz_psra_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psra_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -2031,8 +1564,7 @@ define <8 x i64> @test_x86_avx512_psllv_q(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psllv_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -2043,8 +1575,7 @@ define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8
define <8 x i64> @test_x86_avx512_maskz_psllv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psllv_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -2098,8 +1629,7 @@ define <8 x i64> @test_x86_avx512_psrav_q(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrav_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -2110,8 +1640,7 @@ define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8
define <8 x i64> @test_x86_avx512_maskz_psrav_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrav_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -2164,8 +1693,7 @@ define <8 x i64> @test_x86_avx512_psrlv_q(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrlv_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -2176,8 +1704,7 @@ define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8
define <8 x i64> @test_x86_avx512_maskz_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrlv_q:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -2378,8 +1905,7 @@ define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float>
define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_rn:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -2390,8 +1916,7 @@ define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8
define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_rd:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -2402,8 +1927,7 @@ define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8
define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_ru:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -2414,8 +1938,7 @@ define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8
define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_rz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -2423,142 +1946,6 @@ define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8
ret <8 x double> %res
}
-define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: test_xor_epi32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_xor_epi32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: test_or_epi32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_or_epi32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: test_and_epi32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_and_epi32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) {
-; CHECK-LABEL: test_xor_epi64:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_xor_epi64:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
- ret < 8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) {
-; CHECK-LABEL: test_or_epi64:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_or_epi64:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
- ret < 8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) {
-; CHECK-LABEL: test_and_epi64:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_and_epi64:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
- ret < 8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-
define <16 x i32> @test_mask_add_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_add_epi32_rr:
; CHECK: ## BB#0:
@@ -2779,8 +2166,7 @@ define <8 x i64> @test_mask_add_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rrk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -2791,8 +2177,7 @@ define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64>
define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rrkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
@@ -2812,8 +2197,7 @@ define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -2825,8 +2209,7 @@ define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x
define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%b = load <8 x i64>, <8 x i64>* %ptr_b
@@ -2849,8 +2232,7 @@ define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmbk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -2864,8 +2246,7 @@ define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64>
define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmbkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
@@ -2889,8 +2270,7 @@ define <8 x i64> @test_mask_sub_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rrk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -2901,8 +2281,7 @@ define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64>
define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rrkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
@@ -2922,8 +2301,7 @@ define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -2935,8 +2313,7 @@ define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x
define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%b = load <8 x i64>, <8 x i64>* %ptr_b
@@ -2959,8 +2336,7 @@ define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmbk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -2974,8 +2350,7 @@ define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64>
define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmbkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
@@ -2999,8 +2374,7 @@ define <8 x i64> @test_mask_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rrk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -3011,8 +2385,7 @@ define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64
define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rrkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
@@ -3032,8 +2405,7 @@ define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -3045,8 +2417,7 @@ define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8
define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
@@ -3070,8 +2441,7 @@ define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) {
define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmbk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -3086,8 +2456,7 @@ define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64>
define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmbkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
@@ -3112,8 +2481,7 @@ define <8 x i64> @test_mask_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) {
define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rrk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -3124,8 +2492,7 @@ define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64
define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rrkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
@@ -3145,8 +2512,7 @@ define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -3158,8 +2524,7 @@ define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8
define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
@@ -3183,8 +2548,7 @@ define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) {
define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmbk:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -3199,8 +2563,7 @@ define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64>
define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmbkz:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
@@ -4314,8 +3677,7 @@ define <16 x i32>@test_int_x86_avx512_mask_pmaxs_d_512(<16 x i32> %x0, <16 x i32
define <8 x i64>@test_int_x86_avx512_mask_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
@@ -4347,8 +3709,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64>, <8 x i64>, <8 x i
define <8 x i64>@test_int_x86_avx512_mask_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
@@ -4380,8 +3741,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64>, <8 x i64>, <8 x i
define <8 x i64>@test_int_x86_avx512_mask_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmins_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
@@ -4411,8 +3771,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64>, <8 x i64>, <8 x i
define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pminu_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpminuq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
@@ -4446,8 +3805,7 @@ declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x
define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vpermi2pd %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1
@@ -4481,8 +3839,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <
define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
@@ -4517,8 +3874,7 @@ declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x do
define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm2
; CHECK-NEXT: vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT: vpermt2pd %zmm1, %zmm0, %zmm1
@@ -4556,8 +3912,7 @@ declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>,
define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
@@ -4590,8 +3945,7 @@ declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x doub
define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_scalef_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vscalefpd {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vscalefpd {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
@@ -4617,142 +3971,6 @@ define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16
ret <16 x float> %res2
}
-declare <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vunpckhpd {{.*#+}} zmm2 = zmm2[1],k1[1],zmm2[3],k1[3],zmm2[5],k1[5],zmm2[7],k1[7]
-; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
- %res2 = fadd <8 x double> %res, %res1
- ret <8 x double> %res2
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_mask_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vunpckhps {{.*#+}} zmm2 = zmm2[2],k1[2],zmm2[3],k1[3],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[14],k1[14],zmm2[15],k1[15]
-; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
- %res2 = fadd <16 x float> %res, %res1
- ret <16 x float> %res2
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[2],k1[2],zmm2[4],k1[4],zmm2[6],k1[6]
-; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
- %res2 = fadd <8 x double> %res, %res1
- ret <8 x double> %res2
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_mask_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vunpcklps {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[12],k1[12],zmm2[13],k1[13]
-; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
- %res2 = fadd <16 x float> %res, %res1
- ret <16 x float> %res2
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[2],k1[2],zmm2[4],k1[4],zmm2[6],k1[6]
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = k1[0],zmm0[0],k1[2],zmm0[2],k1[4],zmm0[4],k1[6],zmm0[6]
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer,i8 %x3)
- %res3 = add <8 x i64> %res, %res1
- %res4 = add <8 x i64> %res2, %res3
- ret <8 x i64> %res4
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm2[1],k1[1],zmm2[3],k1[3],zmm2[5],k1[5],zmm2[7],k1[7]
-; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- %res2 = add <8 x i64> %res, %res1
- ret <8 x i64> %res2
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_mask_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],k1[2],zmm2[3],k1[3],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[14],k1[14],zmm2[15],k1[15]
-; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
- %res2 = add <16 x i32> %res, %res1
- ret <16 x i32> %res2
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpunpckldq {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[12],k1[12],zmm2[13],k1[13]
-; CHECK-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
- %res2 = add <16 x i32> %res, %res1
- ret <16 x i32> %res2
-}
-
declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
@@ -4778,8 +3996,7 @@ declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovqb %zmm0, (%rdi)
; CHECK-NEXT: vpmovqb %zmm0, (%rdi) {%k1}
; CHECK-NEXT: retq
@@ -4861,8 +4078,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovqw %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vpmovqw %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovqw %zmm0, %xmm0
@@ -4882,8 +4098,7 @@ declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovqw %zmm0, (%rdi)
; CHECK-NEXT: vpmovqw %zmm0, (%rdi) {%k1}
; CHECK-NEXT: retq
@@ -4897,8 +4112,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsqw %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vpmovsqw %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovsqw %zmm0, %xmm0
@@ -4932,8 +4146,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovusqw %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vpmovusqw %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovusqw %zmm0, %xmm0
@@ -4967,8 +4180,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovqd %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vpmovqd %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vpmovqd %zmm0, %ymm0
@@ -4988,8 +4200,7 @@ declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovqd %zmm0, (%rdi)
; CHECK-NEXT: vpmovqd %zmm0, (%rdi) {%k1}
; CHECK-NEXT: retq
@@ -5003,8 +4214,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsqd %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vpmovsqd %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vpmovsqd %zmm0, %ymm0
@@ -5038,8 +4248,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovusqd %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vpmovusqd %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vpmovusqd %zmm0, %ymm0
@@ -5277,8 +4486,7 @@ declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>,
define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1 {%k1}
; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
@@ -5310,8 +4518,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double>, <8 x i32>, i8
define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtpd2dq %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvtpd2dq {rn-sae}, %zmm0, %ymm0
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -5327,8 +4534,7 @@ declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>
define <8 x float>@test_int_x86_avx512_mask_cvt_pd2ps_512(<8 x double> %x0, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtpd2ps %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvtpd2ps {ru-sae}, %zmm0, %ymm0
; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
@@ -5344,8 +4550,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i
define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtpd2udq {ru-sae}, %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvtpd2udq {rn-sae}, %zmm0, %ymm0
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -5377,8 +4582,7 @@ declare <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float>, <8 x double
define <8 x double>@test_int_x86_avx512_mask_cvt_ps2pd_512(<8 x float> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtps2pd %ymm0, %zmm1 {%k1}
; CHECK-NEXT: vcvtps2pd {sae}, %ymm0, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
@@ -5410,8 +4614,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double>, <8 x i32>, i
define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvttpd2dq %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvttpd2dq {sae}, %zmm0, %ymm0
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -5427,8 +4630,7 @@ declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>
define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm1 {%k1}
; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
@@ -5461,8 +4663,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double>, <8 x i32>,
define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvttpd2udq %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvttpd2udq {sae}, %zmm0, %ymm0
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -5505,39 +4706,6 @@ define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_512(<16 x float> %x0, <16
ret <16 x i32> %res2
}
-
-declare <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float>, <4 x float>,<4 x float>, i8, i32)
-define <4 x float>@test_int_x86_avx512_mask_scalef_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vscalefss %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vscalefss {rn-sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4, i32 4)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 -1, i32 8)
- %res2 = fadd <4 x float> %res, %res1
- ret <4 x float> %res2
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double>, <2 x double>,<2 x double>, i8, i32)
-define <2 x double>@test_int_x86_avx512_mask_scalef_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_scalef_sd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vscalefsd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vscalefsd {rn-sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4, i32 4)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 -1, i32 8)
- %res2 = fadd <2 x double> %res, %res1
- ret <2 x double> %res2
-}
-
declare <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
@@ -5601,8 +4769,7 @@ define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: shlb $7, %al
-; CHECK-NEXT: sarb $7, %al
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %AX<kill>
; CHECK-NEXT: retq
%res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)
@@ -5623,8 +4790,7 @@ define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1
; CHECK-NEXT: kandw %k2, %k1, %k1
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: shlb $7, %al
-; CHECK-NEXT: sarb $7, %al
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %AX<kill>
; CHECK-NEXT: retq
%res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4)
@@ -5647,8 +4813,7 @@ define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcmpunordss %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: shlb $7, %al
-; CHECK-NEXT: sarb $7, %al
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %AX<kill>
; CHECK-NEXT: retq
%res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4)
@@ -5661,15 +4826,16 @@ define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1,
; CHECK: ## BB#0:
; CHECK-NEXT: vcmpless %xmm1, %xmm0, %k1
; CHECK-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: vcmpneqss %xmm1, %xmm0, %k1
-; CHECK-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k1 {%k1}
; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: kmovw %edi, %k2
-; CHECK-NEXT: kandw %k2, %k1, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: shlb $7, %al
-; CHECK-NEXT: sarb $7, %al
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vcmpneqss %xmm1, %xmm0, %k2 {%k1}
+; CHECK-NEXT: kmovw %k2, %ecx
+; CHECK-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k1 {%k1}
+; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: kmovw %k0, %edx
+; CHECK-NEXT: andb %cl, %al
+; CHECK-NEXT: andb %dl, %al
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %AX<kill>
; CHECK-NEXT: retq
%res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
%res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8)
@@ -5688,7 +4854,7 @@ define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -5703,10 +4869,9 @@ declare <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double>, <8 x double>
define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1]
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm3 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vaddpd %zmm3, %zmm0, %zmm0
@@ -5726,7 +4891,7 @@ define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32>
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -5741,9 +4906,8 @@ declare <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64>, <8 x i64>, i32, <8
define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -5758,8 +4922,7 @@ declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8
define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vgetmantpd $11, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vgetmantpd $11, {sae}, %zmm0, %zmm0
; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
@@ -5842,10 +5005,9 @@ declare <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double>, <8 x double
define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshufpd {{.*#+}} zmm2 = zmm2[0],k1[1],zmm2[3],k1[2],zmm2[5],k1[4],zmm2[6],k1[6]
-; CHECK-NEXT: vshufpd {{.*#+}} zmm3 = k1[0],zmm0[1],k1[3],zmm0[2],k1[5],zmm0[4],k1[6],zmm0[6]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: vshufpd {{.*#+}} zmm3 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vaddpd %zmm3, %zmm0, %zmm0
@@ -5865,7 +5027,7 @@ define <16 x float>@test_int_x86_avx512_mask_shuf_ps_512(<16 x float> %x0, <16 x
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vshufps {{.*#+}} zmm2 = zmm2[2,1],k1[1,0],zmm2[6,5],k1[5,4],zmm2[10,9],k1[9,8],zmm2[14,13],k1[13,12]
+; CHECK-NEXT: vshufps {{.*#+}} zmm2 {%k1} = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12]
; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12]
; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
@@ -5875,54 +5037,12 @@ define <16 x float>@test_int_x86_avx512_mask_shuf_ps_512(<16 x float> %x0, <16 x
ret <16 x float> %res2
}
-declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermilpd {{.*#+}} zmm1 = zmm1[0,1,3,2,5,4,6,6]
-; CHECK-NEXT: vpermilpd {{.*#+}} zmm2 = k1[0,1,3,2,5,4,6,6]
-; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,3,2,5,4,6,6]
-; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1)
- %res3 = fadd <8 x double> %res, %res1
- %res4 = fadd <8 x double> %res3, %res2
- ret <8 x double> %res4
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float>, i32, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpermilps {{.*#+}} zmm1 = zmm1[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
-; CHECK-NEXT: vpermilps {{.*#+}} zmm2 = k1[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
-; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
-; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1)
- %res3 = fadd <16 x float> %res, %res1
- %res4 = fadd <16 x float> %res3, %res2
- ret <16 x float> %res4
-}
-
declare <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm0
@@ -5957,9 +5077,9 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0,
ret <16 x float> %res4
}
-declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i8)
+declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i16)
-define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i8 %x4) {
+define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
@@ -5969,17 +5089,17 @@ define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <
; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i8 %x4)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i8 -1)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i8 %x4)
+ %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4)
%res3 = fadd <16 x float> %res, %res1
%res4 = fadd <16 x float> %res2, %res3
ret <16 x float> %res4
}
-declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i8)
+declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i16)
-define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i8 %x4) {
+define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
@@ -5989,9 +5109,9 @@ define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x
; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i8 %x4)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i8 -1)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i8 %x4)
+ %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4)
%res3 = add <16 x i32> %res, %res1
%res4 = add <16 x i32> %res2, %res3
ret <16 x i32> %res4
@@ -6002,8 +5122,7 @@ declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x do
define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x4_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
@@ -6023,8 +5142,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i3
define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x4_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -6039,9 +5157,9 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i6
ret <8 x i64> %res4
}
-declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<4 x float>, <4 x float>, <2 x double>, i8, i32)
+declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x float>, <2 x double>, i8, i32)
-define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<4 x float> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) {
+define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ss2sd_round:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
@@ -6050,15 +5168,15 @@ define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<4 x float> %x0,<4
; CHECK-NEXT: vcvtss2sd {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
; CHECK-NEXT: retq
- %res = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<4 x float> %x0, <4 x float> %x1, <2 x double> %x2, i8 %x3, i32 4)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<4 x float> %x0, <4 x float> %x1, <2 x double> %x2, i8 -1, i32 8)
+ %res = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 -1, i32 8)
%res2 = fadd <2 x double> %res, %res1
ret <2 x double> %res2
}
-declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<2 x double>, <2 x double>, <4 x float>, i8, i32)
+declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float>, <2 x double>, <4 x float>, i8, i32)
-define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<2 x double> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) {
+define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<4 x float> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_sd2ss_round:
; CHECK: ## BB#0:
; CHECK-NEXT: andl $1, %edi
@@ -6067,8 +5185,8 @@ define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<2 x double> %x0,<2
; CHECK-NEXT: vcvtsd2ss {rn-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<2 x double> %x0, <2 x double> %x1, <4 x float> %x2, i8 %x3, i32 3)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<2 x double> %x0, <2 x double> %x1, <4 x float> %x2, i8 -1, i32 8)
+ %res = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 %x3, i32 3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 -1, i32 8)
%res2 = fadd <4 x float> %res, %res1
ret <4 x float> %res2
}
@@ -6112,8 +5230,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8
define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0
@@ -6130,8 +5247,7 @@ declare <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64>, <8 x i64>, <8
define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1} {z}
; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0
@@ -6143,73 +5259,11 @@ define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i6
ret <8 x i64> %res2
}
-declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
- %res3 = fadd <16 x float> %res, %res1
- %res4 = fadd <16 x float> %res2, %res3
- ret <16 x float> %res4
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float>, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
- %res3 = fadd <16 x float> %res, %res1
- %res4 = fadd <16 x float> %res2, %res3
- ret <16 x float> %res4
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movddup_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovddup {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2)
- %res3 = fadd <8 x double> %res, %res1
- %res4 = fadd <8 x double> %res2, %res3
- ret <8 x double> %res4
-}
-
define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_eq_sae:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd {sae}, %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: vcmpeqsd {sae}, %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8)
ret i32 %res
@@ -6218,9 +5272,8 @@ define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq_sae:
; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd {sae}, %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: vcmpeq_uqsd {sae}, %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8)
ret i32 %res
@@ -6229,9 +5282,8 @@ define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1)
define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_eq:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: vcmpeqsd %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4)
ret i32 %res
@@ -6240,9 +5292,8 @@ define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq:
; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd %xmm1, %xmm0
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: vcmpeq_uqsd %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4)
ret i32 %res
@@ -6251,9 +5302,8 @@ define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_lt_sae:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd {sae}, %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: vcmpltsd {sae}, %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8)
ret i32 %res
@@ -6262,9 +5312,8 @@ define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt_sae:
; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd {sae}, %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: vcmpngesd {sae}, %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8)
ret i32 %res
@@ -6273,9 +5322,8 @@ define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1)
define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_lt:
; CHECK: ## BB#0:
-; CHECK-NEXT: vcomisd %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: vcmpltsd %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4)
ret i32 %res
@@ -6284,9 +5332,8 @@ define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt:
; CHECK: ## BB#0:
-; CHECK-NEXT: vucomisd %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: vcmpngesd %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4)
ret i32 %res
@@ -6297,9 +5344,8 @@ declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32)
define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_ss_lt:
; CHECK: ## BB#0:
-; CHECK-NEXT: vucomiss %xmm1, %xmm0
-; CHECK-NEXT: sbbl %eax, %eax
-; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: vcmpngess %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
%res = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4)
ret i32 %res
@@ -6377,12 +5423,15 @@ declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float>, <16 x
define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512(<4 x float> %x0, <16 x float> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512:
-; CHECK: kmovw %edi, %k1
-; CHECK: vshuff32x4 $0, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshuff32x4 $0, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshuff32x4 $0, %zmm0, %zmm0, %zmm0
-; CHECK: vaddps %zmm1, %zmm0, %zmm0
-; CHECK: vaddps %zmm0, %zmm2, %zmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 -1)
%res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask)
@@ -6396,12 +5445,15 @@ declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double>, <8 x
define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512:
-; CHECK: kmovw %eax, %k1
-; CHECK: vshuff64x2 $68, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshuff64x2 $68, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshuff64x2 $68, %zmm0, %zmm0, %zmm0
-; CHECK: vaddpd %zmm1, %zmm0, %zmm0
-; CHECK: vaddpd %zmm0, %zmm2, %zmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1)
%res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask)
@@ -6415,12 +5467,15 @@ declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32>, <16 x i32
define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512:
-; CHECK: kmovw %edi, %k1
-; CHECK: vshufi32x4 $0, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshufi32x4 $0, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshufi32x4 $0, %zmm0, %zmm0, %zmm0
-; CHECK: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1)
%res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask)
@@ -6434,12 +5489,15 @@ declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64>, <8 x i64>,
define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512:
-; CHECK: kmovw %eax, %k1
-; CHECK: vshufi64x2 $68, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshufi64x2 $68, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshufi64x2 $68, %zmm0, %zmm0, %zmm0
-; CHECK: vpaddq %zmm1, %zmm0, %zmm0
-; CHECK: vpaddq %zmm0, %zmm2, %zmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask)
@@ -6449,30 +5507,29 @@ define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x
ret <8 x i64> %res5
}
-declare <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64>, i8, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64>, i32, <8 x i64>, i8)
-define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
+define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsrlq $255, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vpsrlq $255, %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT: vpsrlq $255, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i8 255, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i8 255, <8 x i64> %x2, i8 -1)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i8 255, <8 x i64> zeroinitializer, i8 %x3)
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 255, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 255, <8 x i64> %x2, i8 -1)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 255, <8 x i64> zeroinitializer, i8 %x3)
%res3 = add <8 x i64> %res, %res1
%res4 = add <8 x i64> %res3, %res2
ret <8 x i64> %res4
}
-declare <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i8, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i32, <16 x i32>, i16)
-define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
+define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
@@ -6482,17 +5539,17 @@ define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i8 %x1, <
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> %x2, i16 %x3)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> %x2, i16 -1)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> zeroinitializer, i16 %x3)
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 255, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 255, <16 x i32> %x2, i16 -1)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 255, <16 x i32> zeroinitializer, i16 %x3)
%res3 = add <16 x i32> %res, %res1
%res4 = add <16 x i32> %res3, %res2
ret <16 x i32> %res4
}
-declare <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32>, i8, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32>, i32, <16 x i32>, i16)
-define <16 x i32>@test_int_x86_avx512_mask_psra_di_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
+define <16 x i32>@test_int_x86_avx512_mask_psra_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
@@ -6502,38 +5559,37 @@ define <16 x i32>@test_int_x86_avx512_mask_psra_di_512(<16 x i32> %x0, i8 %x1, <
; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 %x3)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i8 3, <16 x i32> zeroinitializer, i16 %x3)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 -1)
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
%res3 = add <16 x i32> %res, %res1
%res4 = add <16 x i32> %res3, %res2
ret <16 x i32> %res4
}
-declare <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64>, i8, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64>, i32, <8 x i64>, i8)
-define <8 x i64>@test_int_x86_avx512_mask_psra_qi_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
+define <8 x i64>@test_int_x86_avx512_mask_psra_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsraq $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vpsraq $3, %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT: vpsraq $3, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i8 3, <8 x i64> zeroinitializer, i8 %x3)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 -1)
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
%res3 = add <8 x i64> %res, %res1
%res4 = add <8 x i64> %res3, %res2
ret <8 x i64> %res4
}
-declare <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32>, i8, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32>, i32, <16 x i32>, i16)
-define <16 x i32>@test_int_x86_avx512_mask_psll_di_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
+define <16 x i32>@test_int_x86_avx512_mask_psll_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_512:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
@@ -6543,67 +5599,46 @@ define <16 x i32>@test_int_x86_avx512_mask_psll_di_512(<16 x i32> %x0, i8 %x1, <
; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 %x3)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i8 3, <16 x i32> zeroinitializer, i16 %x3)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 -1)
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
%res3 = add <16 x i32> %res, %res1
%res4 = add <16 x i32> %res3, %res2
ret <16 x i32> %res4
}
-declare <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64>, i8, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64>, i32, <8 x i64>, i8)
-define <8 x i64>@test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
+define <8 x i64>@test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsllq $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vpsllq $3, %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT: vpsllq $3, %zmm0, %zmm0
; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i8 3, <8 x i64> zeroinitializer, i8 %x3)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 -1)
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
%res3 = add <8 x i64> %res, %res1
%res4 = add <8 x i64> %res3, %res2
ret <8 x i64> %res4
}
-declare <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32>, i16, <16 x i32>, i8)
-
-define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i16 %x1, <16 x i32> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpshufd $3, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpshufd $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpshufd $3, %zmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i16 3, <16 x i32> %x2, i8 %x3)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i16 3, <16 x i32> zeroinitializer, i8 %x3)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i16 3, <16 x i32> %x2, i8 -1)
- %res3 = add <16 x i32> %res, %res1
- %res4 = add <16 x i32> %res3, %res2
- ret <16 x i32> %res4
-}
-
declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3)
%res2 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
@@ -6617,14 +5652,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i
define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
@@ -6633,61 +5667,58 @@ define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %
ret <8 x i64> %res4
}
-declare <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32>, i8, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32>, i32, <16 x i32>, i16)
-define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
+define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vprold $3, %zmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 %x3)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i8 3, <16 x i32> zeroinitializer, i16 %x3)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 -1)
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vprold $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
%res3 = add <16 x i32> %res, %res1
%res4 = add <16 x i32> %res3, %res2
ret <16 x i32> %res4
}
-declare <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64>, i8, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64>, i32, <8 x i64>, i8)
-define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
+define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i8 3, <8 x i64> zeroinitializer, i8 %x3)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
%res3 = add <8 x i64> %res, %res1
%res4 = add <8 x i64> %res3, %res2
ret <8 x i64> %res4
}
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-
declare <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8>, <16 x i32>, i16)
define <16 x i32>@test_int_x86_avx512_mask_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpmovzxbd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovzxbd %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxbd %xmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm2 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
%res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
@@ -6701,14 +5732,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxbq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovzxbq %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxbq %xmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm2 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
@@ -6722,14 +5752,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxdq %ymm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovzxdq %ymm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxdq %ymm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm2 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
@@ -6743,13 +5772,13 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16>, <16 x i32>, i
define <16 x i32>@test_int_x86_avx512_mask_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpmovzxwd %ymm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovzxwd %ymm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxwd %ymm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm2 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
%res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
@@ -6763,14 +5792,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxwq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovzxwq %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxwq %xmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm2 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1)
@@ -6784,13 +5812,13 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8>, <16 x i32>, i1
define <16 x i32>@test_int_x86_avx512_mask_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpmovsxbd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovsxbd %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxbd %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovsxbd %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
%res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
@@ -6804,14 +5832,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxbq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovsxbq %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxbq %xmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxbq %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovsxbq %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxbq %xmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
@@ -6825,14 +5852,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxdq %ymm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovsxdq %ymm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxdq %ymm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovsxdq %ymm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
@@ -6847,13 +5873,13 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16>, <16 x i32>, i
define <16 x i32>@test_int_x86_avx512_mask_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpmovsxwd %ymm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovsxwd %ymm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
%res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
@@ -6868,14 +5894,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxwq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpmovsxwq %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxwq %xmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpmovsxwq %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpmovsxwq %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpmovsxwq %xmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1)
@@ -6884,4 +5909,532 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64>
ret <8 x i64> %res4
}
+declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8)
+define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_df_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res3, %res2
+ ret <8 x double> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_di_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res3, %res2
+ ret <8 x i64> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_sf_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res3, %res2
+ ret <16 x float> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_si_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res3, %res2
+ ret <16 x i32> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
+; CHECK-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm1
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 8)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res3, %res2
+ ret <8 x double> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfixupimmpd $3, %zmm2, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
+; CHECK-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm1
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4)
+ %res2 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 2, i8 -1, i32 8)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res3, %res2
+ ret <8 x double> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1}
+; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm1
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 8)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res3, %res2
+ ret <4 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
+; CHECK-NEXT: vmovaps %zmm0, %zmm4
+; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 8)
+ %res2 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 4)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res3, %res2
+ ret <4 x float> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
+; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm1
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 8)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res3, %res2
+ ret <16 x float> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vmovaps %zmm0, %zmm4
+; CHECK-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm4
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vaddps %zmm4, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 8)
+ %res2 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 4)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res3, %res2
+ ret <16 x float> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %zmm0, %zmm4
+; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddpd %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 -1, i32 4)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res3, %res2
+ ret <2 x double> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
+; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm1
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
+ %res2 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 8)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res3, %res2
+ ret <2 x double> %res4
+}
+
+declare i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32>, <16 x i32>, i16 %x2)
+
+define i16@test_int_x86_avx512_ptestnm_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %ecx
+; CHECK-NEXT: vptestnmd %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2)
+ %res1 = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16-1)
+ %res2 = add i16 %res, %res1
+ ret i16 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64>, <8 x i64>, i8 %x2)
+
+define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %ecx
+; CHECK-NEXT: vptestnmq %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: addb %cl, %al
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1}
+; CHECK-NEXT: vpbroadcastd %edi, %zmm1 {%k1} {z}
+; CHECK-NEXT: vpbroadcastd %edi, %zmm2
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res2, %res3
+ ret <16 x i32> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16)
+
+define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1}
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm2
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res2, %res3
+ ret <8 x i64> %res4
+}
+declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8)
+
+declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd132sd %xmm1, %xmm2, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm4
+; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vfmadd132sd {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
+; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
+; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3)
+ %res3 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
+ %res4 = fadd <2 x double> %res, %res1
+ %res5 = fadd <2 x double> %res2, %res3
+ %res6 = fadd <2 x double> %res4, %res5
+ ret <2 x double> %res6
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vfmadd132ss %xmm1, %xmm2, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm4
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vfmadd132ss {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
+; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
+; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3)
+ %res3 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
+ %res4 = fadd <4 x float> %res, %res1
+ %res5 = fadd <4 x float> %res2, %res3
+ %res6 = fadd <4 x float> %res4, %res5
+ ret <4 x float> %res6
+}
+
+declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1 {%k1} {z}
+; CHECK-NEXT: vaddpd %xmm1, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res
+}
+declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm4
+; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %zmm2, %zmm5
+; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
+; CHECK-NEXT: vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3)
+ %res3 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
+ %res4 = fadd <2 x double> %res, %res1
+ %res5 = fadd <2 x double> %res2, %res3
+ %res6 = fadd <2 x double> %res4, %res5
+ ret <2 x double> %res6
+}
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm4
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT: vmovaps %zmm2, %zmm5
+; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
+; CHECK-NEXT: vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3)
+ %res3 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
+ %res4 = fadd <4 x float> %res, %res1
+ %res5 = fadd <4 x float> %res2, %res3
+ %res6 = fadd <4 x float> %res4, %res5
+ ret <4 x float> %res6
+}
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, float *%ptr_b ,i8 %x3,i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %esi
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vfmadd231ss (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %q = load float, float* %ptr_b
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4)
+ ret < 4 x float> %res
+}
+
+define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %esi
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vfmadd132ss (%rdi), %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %q = load float, float* %ptr_b
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0,<4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4)
+ ret < 4 x float> %res
+}
+
+
+define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: vfmadd213ss (%rdi), %xmm0, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %q = load float, float* %ptr_b
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %vecinit.i, i8 0, i32 4)
+ ret < 4 x float> %res
+}
diff --git a/test/CodeGen/X86/avx512-logic.ll b/test/CodeGen/X86/avx512-logic.ll
index c973b706e8fc..d085467868ab 100644
--- a/test/CodeGen/X86/avx512-logic.ll
+++ b/test/CodeGen/X86/avx512-logic.ll
@@ -17,6 +17,22 @@ entry:
ret <16 x i32> %x
}
+define <16 x i32> @vpandnd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: vpandnd:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; ALL-NEXT: vpandnd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <16 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
+ i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %b2 = xor <16 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1,
+ i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %x = and <16 x i32> %a2, %b2
+ ret <16 x i32> %x
+}
+
define <16 x i32> @vpord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: vpord:
; ALL: ## BB#0: ## %entry
@@ -58,6 +74,20 @@ entry:
ret <8 x i64> %x
}
+define <8 x i64> @vpandnq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: vpandnq:
+; ALL: ## BB#0: ## %entry
+; ALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; ALL-NEXT: vpandnq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <8 x i64> %a, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+ %b2 = xor <8 x i64> %b, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+ %x = and <8 x i64> %a2, %b2
+ ret <8 x i64> %x
+}
+
define <8 x i64> @vporq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: vporq:
; ALL: ## BB#0: ## %entry
@@ -133,6 +163,25 @@ define <64 x i8> @and_v64i8(<64 x i8> %a, <64 x i8> %b) {
ret <64 x i8> %res
}
+define <64 x i8> @andn_v64i8(<64 x i8> %a, <64 x i8> %b) {
+; KNL-LABEL: andn_v64i8:
+; KNL: ## BB#0:
+; KNL-NEXT: vandnps %ymm0, %ymm2, %ymm0
+; KNL-NEXT: vandnps %ymm1, %ymm3, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: andn_v64i8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpandnq %zmm0, %zmm1, %zmm0
+; SKX-NEXT: retq
+ %b2 = xor <64 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
+ i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
+ i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
+ i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %res = and <64 x i8> %a, %b2
+ ret <64 x i8> %res
+}
+
define <64 x i8> @or_v64i8(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: or_v64i8:
; KNL: ## BB#0:
@@ -178,6 +227,23 @@ define <32 x i16> @and_v32i16(<32 x i16> %a, <32 x i16> %b) {
ret <32 x i16> %res
}
+define <32 x i16> @andn_v32i16(<32 x i16> %a, <32 x i16> %b) {
+; KNL-LABEL: andn_v32i16:
+; KNL: ## BB#0:
+; KNL-NEXT: vandnps %ymm0, %ymm2, %ymm0
+; KNL-NEXT: vandnps %ymm1, %ymm3, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: andn_v32i16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpandnq %zmm0, %zmm1, %zmm0
+; SKX-NEXT: retq
+ %b2 = xor <32 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1,
+ i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ %res = and <32 x i16> %a, %b2
+ ret <32 x i16> %res
+}
+
define <32 x i16> @or_v32i16(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: or_v32i16:
; KNL: ## BB#0:
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index 015c70a6ba08..cb63f9108e29 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
define i16 @mask16(i16 %x) {
; CHECK-LABEL: mask16:
@@ -8,6 +8,7 @@ define i16 @mask16(i16 %x) {
; CHECK-NEXT: kmovw %edi, %k0
; CHECK-NEXT: knotw %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
%m0 = bitcast i16 %x to <16 x i1>
%m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
@@ -15,13 +16,27 @@ define i16 @mask16(i16 %x) {
ret i16 %ret
}
+define i32 @mask16_zext(i16 %x) {
+; CHECK-LABEL: mask16_zext:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: knotw %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
+ %m0 = bitcast i16 %x to <16 x i1>
+ %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %m2 = bitcast <16 x i1> %m1 to i16
+ %ret = zext i16 %m2 to i32
+ ret i32 %ret
+}
+
define i8 @mask8(i8 %x) {
; KNL-LABEL: mask8:
; KNL: ## BB#0:
-; KNL-NEXT: movzbl %dil, %eax
-; KNL-NEXT: kmovw %eax, %k0
+; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: knotw %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: mask8:
@@ -29,6 +44,7 @@ define i8 @mask8(i8 %x) {
; SKX-NEXT: kmovb %edi, %k0
; SKX-NEXT: knotb %k0, %k0
; SKX-NEXT: kmovb %k0, %eax
+; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SKX-NEXT: retq
%m0 = bitcast i8 %x to <8 x i1>
%m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
@@ -36,6 +52,27 @@ define i8 @mask8(i8 %x) {
ret i8 %ret
}
+define i32 @mask8_zext(i8 %x) {
+; KNL-LABEL: mask8_zext:
+; KNL: ## BB#0:
+; KNL-NEXT: kmovw %edi, %k0
+; KNL-NEXT: knotw %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: mask8_zext:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovb %edi, %k0
+; SKX-NEXT: knotb %k0, %k0
+; SKX-NEXT: kmovb %k0, %eax
+; SKX-NEXT: retq
+ %m0 = bitcast i8 %x to <8 x i1>
+ %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %m2 = bitcast <8 x i1> %m1 to i8
+ %ret = zext i8 %m2 to i32
+ ret i32 %ret
+}
+
define void @mask16_mem(i16* %ptr) {
; CHECK-LABEL: mask16_mem:
; CHECK: ## BB#0:
@@ -54,9 +91,11 @@ define void @mask16_mem(i16* %ptr) {
define void @mask8_mem(i8* %ptr) {
; KNL-LABEL: mask8_mem:
; KNL: ## BB#0:
-; KNL-NEXT: kmovw (%rdi), %k0
+; KNL-NEXT: movzbl (%rdi), %eax
+; KNL-NEXT: kmovw %eax, %k0
; KNL-NEXT: knotw %k0, %k0
-; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: movb %al, (%rdi)
; KNL-NEXT: retq
;
; SKX-LABEL: mask8_mem:
@@ -76,15 +115,34 @@ define void @mask8_mem(i8* %ptr) {
define i16 @mand16(i16 %x, i16 %y) {
; CHECK-LABEL: mand16:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: xorl %esi, %eax
+; CHECK-NEXT: andl %esi, %edi
+; CHECK-NEXT: orl %eax, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %ma = bitcast i16 %x to <16 x i1>
+ %mb = bitcast i16 %y to <16 x i1>
+ %mc = and <16 x i1> %ma, %mb
+ %md = xor <16 x i1> %ma, %mb
+ %me = or <16 x i1> %mc, %md
+ %ret = bitcast <16 x i1> %me to i16
+ ret i16 %ret
+}
+
+define i16 @mand16_mem(<16 x i1>* %x, <16 x i1>* %y) {
+; CHECK-LABEL: mand16_mem:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw (%rdi), %k0
+; CHECK-NEXT: kmovw (%rsi), %k1
; CHECK-NEXT: kandw %k1, %k0, %k2
; CHECK-NEXT: kxorw %k1, %k0, %k0
; CHECK-NEXT: korw %k0, %k2, %k0
; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
- %ma = bitcast i16 %x to <16 x i1>
- %mb = bitcast i16 %y to <16 x i1>
+ %ma = load <16 x i1>, <16 x i1>* %x
+ %mb = load <16 x i1>, <16 x i1>* %y
%mc = and <16 x i1> %ma, %mb
%md = xor <16 x i1> %ma, %mb
%me = or <16 x i1> %mc, %md
@@ -98,6 +156,7 @@ define i8 @shuf_test1(i16 %v) nounwind {
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kshiftrw $8, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: shuf_test1:
@@ -105,6 +164,7 @@ define i8 @shuf_test1(i16 %v) nounwind {
; SKX-NEXT: kmovw %edi, %k0
; SKX-NEXT: kshiftrw $8, %k0, %k0
; SKX-NEXT: kmovb %k0, %eax
+; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SKX-NEXT: retq
%v1 = bitcast i16 %v to <16 x i1>
%mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -119,18 +179,36 @@ define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
; CHECK-NEXT: kshiftlw $10, %k0, %k0
; CHECK-NEXT: kshiftrw $15, %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andl $1, %eax
; CHECK-NEXT: retq
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
%res = zext i1 %cmp_res.i1 to i32
ret i32 %res
-}define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
+}
+
+define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: zext_test2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; CHECK-NEXT: kshiftlw $10, %k0, %k0
+; CHECK-NEXT: kshiftrw $15, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: retq
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
%res = zext i1 %cmp_res.i1 to i16
ret i16 %res
-}define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
+}
+
+define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: zext_test3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; CHECK-NEXT: kshiftlw $10, %k0, %k0
+; CHECK-NEXT: kshiftrw $15, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %AX<kill>
+; CHECK-NEXT: retq
%cmp_res = icmp ugt <16 x i32> %a, %b
%cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
%res = zext i1 %cmp_res.i1 to i8
@@ -232,7 +310,6 @@ define void @test7(<8 x i1> %mask) {
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: movb $85, %al
-; KNL-NEXT: movzbl %al, %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
@@ -246,8 +323,7 @@ define void @test7(<8 x i1> %mask) {
; SKX-NEXT: movb $85, %al
; SKX-NEXT: kmovb %eax, %k1
; SKX-NEXT: korb %k1, %k0, %k0
-; SKX-NEXT: kmovb %k0, %eax
-; SKX-NEXT: testb %al, %al
+; SKX-NEXT: ktestb %k0, %k0
; SKX-NEXT: retq
allocas:
%a= or <8 x i1> %mask, <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
@@ -266,14 +342,15 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
; KNL: ## BB#0:
; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
; KNL-NEXT: cmpl %esi, %edi
-; KNL-NEXT: jg LBB14_1
+; KNL-NEXT: jg LBB17_1
; KNL-NEXT: ## BB#2:
; KNL-NEXT: vpcmpltud %zmm2, %zmm1, %k1
-; KNL-NEXT: jmp LBB14_3
-; KNL-NEXT: LBB14_1:
+; KNL-NEXT: jmp LBB17_3
+; KNL-NEXT: LBB17_1:
; KNL-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
-; KNL-NEXT: LBB14_3:
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: LBB17_3:
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: retq
;
@@ -281,12 +358,12 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
; SKX: ## BB#0:
; SKX-NEXT: vpxord %zmm2, %zmm2, %zmm2
; SKX-NEXT: cmpl %esi, %edi
-; SKX-NEXT: jg LBB14_1
+; SKX-NEXT: jg LBB17_1
; SKX-NEXT: ## BB#2:
; SKX-NEXT: vpcmpltud %zmm2, %zmm1, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
; SKX-NEXT: retq
-; SKX-NEXT: LBB14_1:
+; SKX-NEXT: LBB17_1:
; SKX-NEXT: vpcmpgtd %zmm2, %zmm0, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
; SKX-NEXT: retq
@@ -301,29 +378,30 @@ define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) {
; KNL-LABEL: test9:
; KNL: ## BB#0:
; KNL-NEXT: cmpl %esi, %edi
-; KNL-NEXT: jg LBB15_1
+; KNL-NEXT: jg LBB18_1
; KNL-NEXT: ## BB#2:
; KNL-NEXT: vpmovsxbd %xmm1, %zmm0
-; KNL-NEXT: jmp LBB15_3
-; KNL-NEXT: LBB15_1:
+; KNL-NEXT: jmp LBB18_3
+; KNL-NEXT: LBB18_1:
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: LBB15_3:
+; KNL-NEXT: LBB18_3:
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test9:
; SKX: ## BB#0:
; SKX-NEXT: cmpl %esi, %edi
-; SKX-NEXT: jg LBB15_1
+; SKX-NEXT: jg LBB18_1
; SKX-NEXT: ## BB#2:
; SKX-NEXT: vpsllw $7, %xmm1, %xmm0
-; SKX-NEXT: jmp LBB15_3
-; SKX-NEXT: LBB15_1:
+; SKX-NEXT: jmp LBB18_3
+; SKX-NEXT: LBB18_1:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
-; SKX-NEXT: LBB15_3:
+; SKX-NEXT: LBB18_3:
; SKX-NEXT: vpmovb2m %xmm0, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
; SKX-NEXT: retq
@@ -340,23 +418,23 @@ define <4 x i1> @test11(<4 x i1>%a, <4 x i1>%b, i32 %a1, i32 %b1) {
; KNL-LABEL: test11:
; KNL: ## BB#0:
; KNL-NEXT: cmpl %esi, %edi
-; KNL-NEXT: jg LBB17_2
+; KNL-NEXT: jg LBB20_2
; KNL-NEXT: ## BB#1:
; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: LBB17_2:
+; KNL-NEXT: LBB20_2:
; KNL-NEXT: retq
;
; SKX-LABEL: test11:
; SKX: ## BB#0:
; SKX-NEXT: cmpl %esi, %edi
-; SKX-NEXT: jg LBB17_1
+; SKX-NEXT: jg LBB20_1
; SKX-NEXT: ## BB#2:
; SKX-NEXT: vpslld $31, %xmm1, %xmm0
-; SKX-NEXT: jmp LBB17_3
-; SKX-NEXT: LBB17_1:
+; SKX-NEXT: jmp LBB20_3
+; SKX-NEXT: LBB20_1:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: LBB17_3:
-; SKX-NEXT: vpmovd2m %xmm0, %k0
+; SKX-NEXT: LBB20_3:
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0
; SKX-NEXT: vpmovm2d %k0, %xmm0
; SKX-NEXT: retq
%mask = icmp sgt i32 %a1, %b1
@@ -399,7 +477,8 @@ define <16 x i1> @test15(i32 %x, i32 %y) {
; KNL-NEXT: movw $1, %cx
; KNL-NEXT: cmovgw %ax, %cx
; KNL-NEXT: kmovw %ecx, %k1
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: retq
;
@@ -420,6 +499,7 @@ define <16 x i1> @test15(i32 %x, i32 %y) {
}
define <64 x i8> @test16(i64 %x) {
+;
; KNL-LABEL: test16:
; KNL: ## BB#0:
; KNL-NEXT: pushq %rbp
@@ -430,432 +510,34 @@ define <64 x i8> @test16(i64 %x) {
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: Ltmp2:
; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: pushq %r15
-; KNL-NEXT: pushq %r14
-; KNL-NEXT: pushq %r13
-; KNL-NEXT: pushq %r12
-; KNL-NEXT: pushq %rbx
; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: Ltmp3:
-; KNL-NEXT: .cfi_offset %rbx, -56
-; KNL-NEXT: Ltmp4:
-; KNL-NEXT: .cfi_offset %r12, -48
-; KNL-NEXT: Ltmp5:
-; KNL-NEXT: .cfi_offset %r13, -40
-; KNL-NEXT: Ltmp6:
-; KNL-NEXT: .cfi_offset %r14, -32
-; KNL-NEXT: Ltmp7:
-; KNL-NEXT: .cfi_offset %r15, -24
-; KNL-NEXT: movq %rdi, %rax
-; KNL-NEXT: shrq $32, %rax
-; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp)
-; KNL-NEXT: movl $271, %eax ## imm = 0x10F
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: movl %edi, %ecx
-; KNL-NEXT: andl $1, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm0
-; KNL-NEXT: movl $257, %ecx ## imm = 0x101
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $258, %ecx ## imm = 0x102
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $259, %ecx ## imm = 0x103
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $260, %ecx ## imm = 0x104
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $261, %ecx ## imm = 0x105
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $262, %ecx ## imm = 0x106
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $263, %ecx ## imm = 0x107
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $264, %ecx ## imm = 0x108
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $265, %ecx ## imm = 0x109
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $266, %ecx ## imm = 0x10A
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $267, %ecx ## imm = 0x10B
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $268, %ecx ## imm = 0x10C
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $269, %ecx ## imm = 0x10D
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; KNL-NEXT: movl $270, %ecx ## imm = 0x10E
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
+; KNL-NEXT: subq $64, %rsp
+; KNL-NEXT: movl %edi, (%rsp)
+; KNL-NEXT: shrq $32, %rdi
+; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: kmovw (%rsp), %k1
+; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2} {z}
+; KNL-NEXT: vpmovdb %zmm2, %xmm2
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm2
; KNL-NEXT: movl $1, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm0
-; KNL-NEXT: movl {{[0-9]+}}(%rsp), %r15d
-; KNL-NEXT: movq %r15, %rdx
-; KNL-NEXT: shrq $17, %rdx
-; KNL-NEXT: andb $1, %dl
-; KNL-NEXT: je LBB22_2
-; KNL-NEXT: ## BB#1:
-; KNL-NEXT: movb $-1, %dl
-; KNL-NEXT: LBB22_2:
-; KNL-NEXT: movq %r15, %r11
-; KNL-NEXT: shrq $16, %r11
-; KNL-NEXT: andb $1, %r11b
-; KNL-NEXT: je LBB22_4
-; KNL-NEXT: ## BB#3:
-; KNL-NEXT: movb $-1, %r11b
-; KNL-NEXT: LBB22_4:
-; KNL-NEXT: movq %r15, %r10
-; KNL-NEXT: shrq $18, %r10
-; KNL-NEXT: andb $1, %r10b
-; KNL-NEXT: je LBB22_6
-; KNL-NEXT: ## BB#5:
-; KNL-NEXT: movb $-1, %r10b
-; KNL-NEXT: LBB22_6:
-; KNL-NEXT: movq %r15, %r9
-; KNL-NEXT: shrq $19, %r9
-; KNL-NEXT: andb $1, %r9b
-; KNL-NEXT: je LBB22_8
-; KNL-NEXT: ## BB#7:
-; KNL-NEXT: movb $-1, %r9b
-; KNL-NEXT: LBB22_8:
-; KNL-NEXT: movq %r15, %rbx
-; KNL-NEXT: shrq $20, %rbx
-; KNL-NEXT: andb $1, %bl
-; KNL-NEXT: je LBB22_10
-; KNL-NEXT: ## BB#9:
-; KNL-NEXT: movb $-1, %bl
-; KNL-NEXT: LBB22_10:
-; KNL-NEXT: movq %r15, %r12
-; KNL-NEXT: shrq $21, %r12
-; KNL-NEXT: andb $1, %r12b
-; KNL-NEXT: je LBB22_12
-; KNL-NEXT: ## BB#11:
-; KNL-NEXT: movb $-1, %r12b
-; KNL-NEXT: LBB22_12:
-; KNL-NEXT: movq %r15, %r14
-; KNL-NEXT: shrq $22, %r14
-; KNL-NEXT: andb $1, %r14b
-; KNL-NEXT: je LBB22_14
-; KNL-NEXT: ## BB#13:
-; KNL-NEXT: movb $-1, %r14b
-; KNL-NEXT: LBB22_14:
-; KNL-NEXT: movq %r15, %r8
-; KNL-NEXT: shrq $23, %r8
-; KNL-NEXT: andb $1, %r8b
-; KNL-NEXT: je LBB22_16
-; KNL-NEXT: ## BB#15:
-; KNL-NEXT: movb $-1, %r8b
-; KNL-NEXT: LBB22_16:
-; KNL-NEXT: movq %r15, %r13
-; KNL-NEXT: shrq $24, %r13
-; KNL-NEXT: andb $1, %r13b
-; KNL-NEXT: je LBB22_18
-; KNL-NEXT: ## BB#17:
-; KNL-NEXT: movb $-1, %r13b
-; KNL-NEXT: LBB22_18:
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $25, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_20
-; KNL-NEXT: ## BB#19:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_20:
-; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $26, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_22
-; KNL-NEXT: ## BB#21:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_22:
-; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: movl $272, %esi ## imm = 0x110
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $27, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_24
-; KNL-NEXT: ## BB#23:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_24:
-; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: movl $273, %eax ## imm = 0x111
-; KNL-NEXT: bextrl %esi, %edi, %esi
-; KNL-NEXT: movq %r15, %rcx
-; KNL-NEXT: shrq $28, %rcx
-; KNL-NEXT: andb $1, %cl
-; KNL-NEXT: je LBB22_26
-; KNL-NEXT: ## BB#25:
-; KNL-NEXT: movb $-1, %cl
-; KNL-NEXT: LBB22_26:
-; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vmovd %esi, %xmm2
-; KNL-NEXT: movl $274, %esi ## imm = 0x112
-; KNL-NEXT: movq %r15, %rcx
-; KNL-NEXT: shrq $29, %rcx
-; KNL-NEXT: andb $1, %cl
-; KNL-NEXT: je LBB22_28
-; KNL-NEXT: ## BB#27:
-; KNL-NEXT: movb $-1, %cl
-; KNL-NEXT: LBB22_28:
-; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; KNL-NEXT: bextrl %esi, %edi, %eax
-; KNL-NEXT: movzbl %r11b, %esi
-; KNL-NEXT: movq %r15, %rcx
-; KNL-NEXT: shrq $30, %rcx
-; KNL-NEXT: andb $1, %cl
-; KNL-NEXT: je LBB22_30
-; KNL-NEXT: ## BB#29:
-; KNL-NEXT: movb $-1, %cl
-; KNL-NEXT: LBB22_30:
-; KNL-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; KNL-NEXT: movl $275, %eax ## imm = 0x113
-; KNL-NEXT: bextrl %eax, %edi, %r11d
-; KNL-NEXT: movzbl %dl, %edx
-; KNL-NEXT: vmovd %esi, %xmm3
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $31, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_32
-; KNL-NEXT: ## BB#31:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_32:
-; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2
-; KNL-NEXT: movl $276, %eax ## imm = 0x114
-; KNL-NEXT: bextrl %eax, %edi, %esi
-; KNL-NEXT: movl $277, %r11d ## imm = 0x115
-; KNL-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r10b, %r10d
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_34
-; KNL-NEXT: ## BB#33:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_34:
-; KNL-NEXT: vpinsrb $4, %esi, %xmm2, %xmm2
-; KNL-NEXT: bextrl %r11d, %edi, %edx
-; KNL-NEXT: movl $278, %r11d ## imm = 0x116
-; KNL-NEXT: vpinsrb $2, %r10d, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r9b, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: movq %r15, %rcx
-; KNL-NEXT: shlq $63, %rcx
-; KNL-NEXT: sarq $63, %rcx
-; KNL-NEXT: vmovd %ecx, %xmm4
-; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb $2, %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_36
-; KNL-NEXT: ## BB#35:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_36:
-; KNL-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %r11d, %edi, %edx
-; KNL-NEXT: movl $279, %r9d ## imm = 0x117
-; KNL-NEXT: vpinsrb $3, %esi, %xmm3, %xmm3
-; KNL-NEXT: movzbl %bl, %ebx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb $3, %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_38
-; KNL-NEXT: ## BB#37:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_38:
-; KNL-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %r9d, %edi, %edx
-; KNL-NEXT: movl $280, %esi ## imm = 0x118
-; KNL-NEXT: vpinsrb $4, %ebx, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r12b, %ebx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb $4, %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_40
-; KNL-NEXT: ## BB#39:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_40:
-; KNL-NEXT: vpinsrb $7, %edx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %esi, %edi, %ecx
-; KNL-NEXT: movl $281, %edx ## imm = 0x119
-; KNL-NEXT: vpinsrb $5, %ebx, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r14b, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb $5, %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_42
-; KNL-NEXT: ## BB#41:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_42:
-; KNL-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %ecx
-; KNL-NEXT: movl $282, %edx ## imm = 0x11A
-; KNL-NEXT: vpinsrb $6, %esi, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r8b, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %bl
-; KNL-NEXT: shrb $6, %bl
-; KNL-NEXT: andb $1, %bl
-; KNL-NEXT: je LBB22_44
-; KNL-NEXT: ## BB#43:
-; KNL-NEXT: movb $-1, %bl
-; KNL-NEXT: LBB22_44:
-; KNL-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %eax
-; KNL-NEXT: movl $283, %ecx ## imm = 0x11B
-; KNL-NEXT: vpinsrb $7, %esi, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r13b, %esi
-; KNL-NEXT: movzbl %bl, %edx
-; KNL-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %bl
-; KNL-NEXT: shrb $7, %bl
-; KNL-NEXT: je LBB22_46
-; KNL-NEXT: ## BB#45:
-; KNL-NEXT: movb $-1, %bl
-; KNL-NEXT: LBB22_46:
-; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: movl $284, %edx ## imm = 0x11C
-; KNL-NEXT: vpinsrb $8, %esi, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rax ## 8-byte Reload
-; KNL-NEXT: movzbl %al, %esi
-; KNL-NEXT: movzbl %bl, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $8, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_48
-; KNL-NEXT: ## BB#47:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_48:
-; KNL-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %ecx
-; KNL-NEXT: movl $285, %edx ## imm = 0x11D
-; KNL-NEXT: vpinsrb $9, %esi, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload
-; KNL-NEXT: movzbl %sil, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $9, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_50
-; KNL-NEXT: ## BB#49:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_50:
-; KNL-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %ecx
-; KNL-NEXT: movl $286, %edx ## imm = 0x11E
-; KNL-NEXT: vpinsrb $10, %esi, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload
-; KNL-NEXT: movzbl %sil, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $10, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_52
-; KNL-NEXT: ## BB#51:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_52:
-; KNL-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %edx
-; KNL-NEXT: vpinsrb $11, %esi, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT: movzbl %cl, %ecx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $11, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_54
-; KNL-NEXT: ## BB#53:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_54:
-; KNL-NEXT: vpinsrb $14, %edx, %xmm2, %xmm2
-; KNL-NEXT: shrl $31, %edi
-; KNL-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT: movzbl %cl, %ecx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $12, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_56
-; KNL-NEXT: ## BB#55:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_56:
-; KNL-NEXT: vpinsrb $15, %edi, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT: movzbl %cl, %ecx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $13, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_58
-; KNL-NEXT: ## BB#57:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_58:
-; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; KNL-NEXT: vpinsrb $14, %ecx, %xmm3, %xmm2
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT: movzbl %cl, %ecx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm3
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $14, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB22_60
-; KNL-NEXT: ## BB#59:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB22_60:
-; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; KNL-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm2
-; KNL-NEXT: shrq $15, %r15
-; KNL-NEXT: andb $1, %r15b
-; KNL-NEXT: je LBB22_62
-; KNL-NEXT: ## BB#61:
-; KNL-NEXT: movb $-1, %r15b
-; KNL-NEXT: LBB22_62:
-; KNL-NEXT: movzbl %r15b, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; KNL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
+; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k2} {z}
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; KNL-NEXT: vpsllw $7, %ymm2, %ymm0
; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2
; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
-; KNL-NEXT: leaq -40(%rbp), %rsp
-; KNL-NEXT: popq %rbx
-; KNL-NEXT: popq %r12
-; KNL-NEXT: popq %r13
-; KNL-NEXT: popq %r14
-; KNL-NEXT: popq %r15
+; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
;
@@ -875,444 +557,47 @@ define <64 x i8> @test16(i64 %x) {
}
define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
+;
; KNL-LABEL: test17:
; KNL: ## BB#0:
; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Ltmp8:
+; KNL-NEXT: Ltmp3:
; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Ltmp9:
+; KNL-NEXT: Ltmp4:
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Ltmp10:
+; KNL-NEXT: Ltmp5:
; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: pushq %r15
-; KNL-NEXT: pushq %r14
-; KNL-NEXT: pushq %r13
-; KNL-NEXT: pushq %r12
-; KNL-NEXT: pushq %rbx
; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: Ltmp11:
-; KNL-NEXT: .cfi_offset %rbx, -56
-; KNL-NEXT: Ltmp12:
-; KNL-NEXT: .cfi_offset %r12, -48
-; KNL-NEXT: Ltmp13:
-; KNL-NEXT: .cfi_offset %r13, -40
-; KNL-NEXT: Ltmp14:
-; KNL-NEXT: .cfi_offset %r14, -32
-; KNL-NEXT: Ltmp15:
-; KNL-NEXT: .cfi_offset %r15, -24
-; KNL-NEXT: movq %rdi, %rax
-; KNL-NEXT: shrq $32, %rax
-; KNL-NEXT: movl %eax, {{[0-9]+}}(%rsp)
-; KNL-NEXT: movl %edi, %eax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: vmovd %eax, %xmm0
-; KNL-NEXT: movl $257, %eax ## imm = 0x101
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $258, %eax ## imm = 0x102
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $259, %eax ## imm = 0x103
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $260, %eax ## imm = 0x104
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $261, %eax ## imm = 0x105
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $262, %eax ## imm = 0x106
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $263, %eax ## imm = 0x107
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $264, %eax ## imm = 0x108
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $265, %eax ## imm = 0x109
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $266, %eax ## imm = 0x10A
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $267, %eax ## imm = 0x10B
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $268, %eax ## imm = 0x10C
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $269, %eax ## imm = 0x10D
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $270, %eax ## imm = 0x10E
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; KNL-NEXT: movl $271, %eax ## imm = 0x10F
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
+; KNL-NEXT: subq $64, %rsp
+; KNL-NEXT: movl %edi, (%rsp)
+; KNL-NEXT: shrq $32, %rdi
+; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; KNL-NEXT: kmovw (%rsp), %k1
+; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2} {z}
+; KNL-NEXT: vpmovdb %zmm2, %xmm2
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; KNL-NEXT: xorl %eax, %eax
; KNL-NEXT: cmpl %edx, %esi
; KNL-NEXT: setg %al
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm0
-; KNL-NEXT: movl {{[0-9]+}}(%rsp), %r15d
-; KNL-NEXT: movq %r15, %rdx
-; KNL-NEXT: shrq $17, %rdx
-; KNL-NEXT: andb $1, %dl
-; KNL-NEXT: je LBB23_2
-; KNL-NEXT: ## BB#1:
-; KNL-NEXT: movb $-1, %dl
-; KNL-NEXT: LBB23_2:
-; KNL-NEXT: movq %r15, %r11
-; KNL-NEXT: shrq $16, %r11
-; KNL-NEXT: andb $1, %r11b
-; KNL-NEXT: je LBB23_4
-; KNL-NEXT: ## BB#3:
-; KNL-NEXT: movb $-1, %r11b
-; KNL-NEXT: LBB23_4:
-; KNL-NEXT: movq %r15, %r10
-; KNL-NEXT: shrq $18, %r10
-; KNL-NEXT: andb $1, %r10b
-; KNL-NEXT: je LBB23_6
-; KNL-NEXT: ## BB#5:
-; KNL-NEXT: movb $-1, %r10b
-; KNL-NEXT: LBB23_6:
-; KNL-NEXT: movq %r15, %r9
-; KNL-NEXT: shrq $19, %r9
-; KNL-NEXT: andb $1, %r9b
-; KNL-NEXT: je LBB23_8
-; KNL-NEXT: ## BB#7:
-; KNL-NEXT: movb $-1, %r9b
-; KNL-NEXT: LBB23_8:
-; KNL-NEXT: movq %r15, %rbx
-; KNL-NEXT: shrq $20, %rbx
-; KNL-NEXT: andb $1, %bl
-; KNL-NEXT: je LBB23_10
-; KNL-NEXT: ## BB#9:
-; KNL-NEXT: movb $-1, %bl
-; KNL-NEXT: LBB23_10:
-; KNL-NEXT: movq %r15, %r12
-; KNL-NEXT: shrq $21, %r12
-; KNL-NEXT: andb $1, %r12b
-; KNL-NEXT: je LBB23_12
-; KNL-NEXT: ## BB#11:
-; KNL-NEXT: movb $-1, %r12b
-; KNL-NEXT: LBB23_12:
-; KNL-NEXT: movq %r15, %r14
-; KNL-NEXT: shrq $22, %r14
-; KNL-NEXT: andb $1, %r14b
-; KNL-NEXT: je LBB23_14
-; KNL-NEXT: ## BB#13:
-; KNL-NEXT: movb $-1, %r14b
-; KNL-NEXT: LBB23_14:
-; KNL-NEXT: movq %r15, %r8
-; KNL-NEXT: shrq $23, %r8
-; KNL-NEXT: andb $1, %r8b
-; KNL-NEXT: je LBB23_16
-; KNL-NEXT: ## BB#15:
-; KNL-NEXT: movb $-1, %r8b
-; KNL-NEXT: LBB23_16:
-; KNL-NEXT: movq %r15, %r13
-; KNL-NEXT: shrq $24, %r13
-; KNL-NEXT: andb $1, %r13b
-; KNL-NEXT: je LBB23_18
-; KNL-NEXT: ## BB#17:
-; KNL-NEXT: movb $-1, %r13b
-; KNL-NEXT: LBB23_18:
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $25, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_20
-; KNL-NEXT: ## BB#19:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_20:
-; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $26, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_22
-; KNL-NEXT: ## BB#21:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_22:
-; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: movl $272, %esi ## imm = 0x110
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $27, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_24
-; KNL-NEXT: ## BB#23:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_24:
-; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: movl $273, %eax ## imm = 0x111
-; KNL-NEXT: bextrl %esi, %edi, %esi
-; KNL-NEXT: movq %r15, %rcx
-; KNL-NEXT: shrq $28, %rcx
-; KNL-NEXT: andb $1, %cl
-; KNL-NEXT: je LBB23_26
-; KNL-NEXT: ## BB#25:
-; KNL-NEXT: movb $-1, %cl
-; KNL-NEXT: LBB23_26:
-; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: bextrl %eax, %edi, %eax
-; KNL-NEXT: vmovd %esi, %xmm2
-; KNL-NEXT: movl $274, %esi ## imm = 0x112
-; KNL-NEXT: movq %r15, %rcx
-; KNL-NEXT: shrq $29, %rcx
-; KNL-NEXT: andb $1, %cl
-; KNL-NEXT: je LBB23_28
-; KNL-NEXT: ## BB#27:
-; KNL-NEXT: movb $-1, %cl
-; KNL-NEXT: LBB23_28:
-; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; KNL-NEXT: bextrl %esi, %edi, %eax
-; KNL-NEXT: movzbl %r11b, %esi
-; KNL-NEXT: movq %r15, %rcx
-; KNL-NEXT: shrq $30, %rcx
-; KNL-NEXT: andb $1, %cl
-; KNL-NEXT: je LBB23_30
-; KNL-NEXT: ## BB#29:
-; KNL-NEXT: movb $-1, %cl
-; KNL-NEXT: LBB23_30:
-; KNL-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; KNL-NEXT: movl $275, %eax ## imm = 0x113
-; KNL-NEXT: bextrl %eax, %edi, %r11d
-; KNL-NEXT: movzbl %dl, %edx
-; KNL-NEXT: vmovd %esi, %xmm3
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $31, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_32
-; KNL-NEXT: ## BB#31:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_32:
-; KNL-NEXT: movq %rax, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; KNL-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2
-; KNL-NEXT: movl $276, %eax ## imm = 0x114
-; KNL-NEXT: bextrl %eax, %edi, %esi
-; KNL-NEXT: movl $277, %r11d ## imm = 0x115
-; KNL-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r10b, %r10d
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_34
-; KNL-NEXT: ## BB#33:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_34:
-; KNL-NEXT: vpinsrb $4, %esi, %xmm2, %xmm2
-; KNL-NEXT: bextrl %r11d, %edi, %edx
-; KNL-NEXT: movl $278, %r11d ## imm = 0x116
-; KNL-NEXT: vpinsrb $2, %r10d, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r9b, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: movq %r15, %rcx
-; KNL-NEXT: shlq $63, %rcx
-; KNL-NEXT: sarq $63, %rcx
-; KNL-NEXT: vmovd %ecx, %xmm4
-; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb $2, %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_36
-; KNL-NEXT: ## BB#35:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_36:
-; KNL-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %r11d, %edi, %edx
-; KNL-NEXT: movl $279, %r9d ## imm = 0x117
-; KNL-NEXT: vpinsrb $3, %esi, %xmm3, %xmm3
-; KNL-NEXT: movzbl %bl, %ebx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb $3, %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_38
-; KNL-NEXT: ## BB#37:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_38:
-; KNL-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %r9d, %edi, %edx
-; KNL-NEXT: movl $280, %esi ## imm = 0x118
-; KNL-NEXT: vpinsrb $4, %ebx, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r12b, %ebx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb $4, %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_40
-; KNL-NEXT: ## BB#39:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_40:
-; KNL-NEXT: vpinsrb $7, %edx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %esi, %edi, %ecx
-; KNL-NEXT: movl $281, %edx ## imm = 0x119
-; KNL-NEXT: vpinsrb $5, %ebx, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r14b, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %al
-; KNL-NEXT: shrb $5, %al
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_42
-; KNL-NEXT: ## BB#41:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_42:
-; KNL-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %ecx
-; KNL-NEXT: movl $282, %edx ## imm = 0x11A
-; KNL-NEXT: vpinsrb $6, %esi, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r8b, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %bl
-; KNL-NEXT: shrb $6, %bl
-; KNL-NEXT: andb $1, %bl
-; KNL-NEXT: je LBB23_44
-; KNL-NEXT: ## BB#43:
-; KNL-NEXT: movb $-1, %bl
-; KNL-NEXT: LBB23_44:
-; KNL-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %eax
-; KNL-NEXT: movl $283, %ecx ## imm = 0x11B
-; KNL-NEXT: vpinsrb $7, %esi, %xmm3, %xmm3
-; KNL-NEXT: movzbl %r13b, %esi
-; KNL-NEXT: movzbl %bl, %edx
-; KNL-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4
-; KNL-NEXT: movb %r15b, %bl
-; KNL-NEXT: shrb $7, %bl
-; KNL-NEXT: je LBB23_46
-; KNL-NEXT: ## BB#45:
-; KNL-NEXT: movb $-1, %bl
-; KNL-NEXT: LBB23_46:
-; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; KNL-NEXT: bextrl %ecx, %edi, %ecx
-; KNL-NEXT: movl $284, %edx ## imm = 0x11C
-; KNL-NEXT: vpinsrb $8, %esi, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rax ## 8-byte Reload
-; KNL-NEXT: movzbl %al, %esi
-; KNL-NEXT: movzbl %bl, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $8, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_48
-; KNL-NEXT: ## BB#47:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_48:
-; KNL-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %ecx
-; KNL-NEXT: movl $285, %edx ## imm = 0x11D
-; KNL-NEXT: vpinsrb $9, %esi, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload
-; KNL-NEXT: movzbl %sil, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $9, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_50
-; KNL-NEXT: ## BB#49:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_50:
-; KNL-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %ecx
-; KNL-NEXT: movl $286, %edx ## imm = 0x11E
-; KNL-NEXT: vpinsrb $10, %esi, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rsi ## 8-byte Reload
-; KNL-NEXT: movzbl %sil, %esi
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $10, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_52
-; KNL-NEXT: ## BB#51:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_52:
-; KNL-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
-; KNL-NEXT: bextrl %edx, %edi, %edx
-; KNL-NEXT: vpinsrb $11, %esi, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT: movzbl %cl, %ecx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $11, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_54
-; KNL-NEXT: ## BB#53:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_54:
-; KNL-NEXT: vpinsrb $14, %edx, %xmm2, %xmm2
-; KNL-NEXT: shrl $31, %edi
-; KNL-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT: movzbl %cl, %ecx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $12, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_56
-; KNL-NEXT: ## BB#55:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_56:
-; KNL-NEXT: vpinsrb $15, %edi, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT: movzbl %cl, %ecx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $13, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_58
-; KNL-NEXT: ## BB#57:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_58:
-; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; KNL-NEXT: vpinsrb $14, %ecx, %xmm3, %xmm2
-; KNL-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
-; KNL-NEXT: movzbl %cl, %ecx
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm3
-; KNL-NEXT: movq %r15, %rax
-; KNL-NEXT: shrq $14, %rax
-; KNL-NEXT: andb $1, %al
-; KNL-NEXT: je LBB23_60
-; KNL-NEXT: ## BB#59:
-; KNL-NEXT: movb $-1, %al
-; KNL-NEXT: LBB23_60:
-; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; KNL-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1
-; KNL-NEXT: movzbl %al, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm2
-; KNL-NEXT: shrq $15, %r15
-; KNL-NEXT: andb $1, %r15b
-; KNL-NEXT: je LBB23_62
-; KNL-NEXT: ## BB#61:
-; KNL-NEXT: movb $-1, %r15b
-; KNL-NEXT: LBB23_62:
-; KNL-NEXT: movzbl %r15b, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; KNL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2
; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
-; KNL-NEXT: leaq -40(%rbp), %rsp
-; KNL-NEXT: popq %rbx
-; KNL-NEXT: popq %r12
-; KNL-NEXT: popq %r13
-; KNL-NEXT: popq %r14
-; KNL-NEXT: popq %r15
+; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm2, %xmm2
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k2} {z}
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
+; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
;
@@ -1321,7 +606,6 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
; SKX-NEXT: kmovq %rdi, %k0
; SKX-NEXT: cmpl %edx, %esi
; SKX-NEXT: setg %al
-; SKX-NEXT: andl $1, %eax
; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: kshiftlq $5, %k1, %k1
; SKX-NEXT: korq %k1, %k0, %k0
@@ -1337,8 +621,7 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
define <8 x i1> @test18(i8 %a, i16 %y) {
; KNL-LABEL: test18:
; KNL: ## BB#0:
-; KNL-NEXT: movzbl %dil, %eax
-; KNL-NEXT: kmovw %eax, %k0
+; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kmovw %esi, %k1
; KNL-NEXT: kshiftlw $7, %k1, %k2
; KNL-NEXT: kshiftrw $15, %k2, %k2
@@ -1348,7 +631,8 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kshiftlw $7, %k2, %k1
; KNL-NEXT: korw %k1, %k0, %k1
-; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqw %zmm0, %xmm0
; KNL-NEXT: retq
;
@@ -1392,9 +676,7 @@ define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone {
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %ymm1, %ymm1
; SKX-NEXT: vpmovb2m %ymm1, %k1
-; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; SKX-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
%ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
ret <32 x i16> %ret
@@ -1403,24 +685,17 @@ define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone {
define void @test22(<4 x i1> %a, <4 x i1>* %addr) {
; KNL-LABEL: test22:
; KNL: ## BB#0:
-; KNL-NEXT: vpextrd $3, %xmm0, %eax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: movb %al, (%rdi)
-; KNL-NEXT: vpextrd $2, %xmm0, %eax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: movb %al, (%rdi)
-; KNL-NEXT: vpextrd $1, %xmm0, %eax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: movb %al, (%rdi)
-; KNL-NEXT: vmovd %xmm0, %eax
-; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL-NEXT: vpslld $31, %ymm0, %ymm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
; KNL-NEXT: retq
;
; SKX-LABEL: test22:
; SKX: ## BB#0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
-; SKX-NEXT: vpmovd2m %xmm0, %k0
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
store <4 x i1> %a, <4 x i1>* %addr
@@ -1430,20 +705,1243 @@ define void @test22(<4 x i1> %a, <4 x i1>* %addr) {
define void @test23(<2 x i1> %a, <2 x i1>* %addr) {
; KNL-LABEL: test23:
; KNL: ## BB#0:
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: movb %al, (%rdi)
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
; KNL-NEXT: retq
;
; SKX-LABEL: test23:
; SKX: ## BB#0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
-; SKX-NEXT: vpmovq2m %xmm0, %k0
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
store <2 x i1> %a, <2 x i1>* %addr
ret void
}
+
+define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
+; KNL-LABEL: store_v1i1:
+; KNL: ## BB#0:
+; KNL-NEXT: andl $1, %edi
+; KNL-NEXT: kmovw %edi, %k0
+; KNL-NEXT: kxnorw %k0, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kxorw %k1, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: movb %al, (%rsi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_v1i1:
+; SKX: ## BB#0:
+; SKX-NEXT: andl $1, %edi
+; SKX-NEXT: kmovw %edi, %k0
+; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: kshiftrw $15, %k1, %k1
+; SKX-NEXT: kxorw %k1, %k0, %k0
+; SKX-NEXT: kmovb %k0, (%rsi)
+; SKX-NEXT: retq
+ %x = xor <1 x i1> %c, <i1 1>
+ store <1 x i1> %x, <1 x i1>* %ptr, align 4
+ ret void
+}
+
+define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
+; KNL-LABEL: store_v2i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_v2i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0
+; SKX-NEXT: knotw %k0, %k0
+; SKX-NEXT: kmovb %k0, (%rdi)
+; SKX-NEXT: retq
+ %x = xor <2 x i1> %c, <i1 1, i1 1>
+ store <2 x i1> %x, <2 x i1>* %ptr, align 4
+ ret void
+}
+
+define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
+; KNL-LABEL: store_v4i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; KNL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vpslld $31, %ymm0, %ymm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_v4i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0
+; SKX-NEXT: knotw %k0, %k0
+; SKX-NEXT: kmovb %k0, (%rdi)
+; SKX-NEXT: retq
+ %x = xor <4 x i1> %c, <i1 1, i1 1, i1 1, i1 1>
+ store <4 x i1> %x, <4 x i1>* %ptr, align 4
+ ret void
+}
+
+define void @store_v8i1(<8 x i1> %c , <8 x i1>* %ptr) {
+; KNL-LABEL: store_v8i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT: knotw %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_v8i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k0
+; SKX-NEXT: knotb %k0, %k0
+; SKX-NEXT: kmovb %k0, (%rdi)
+; SKX-NEXT: retq
+ %x = xor <8 x i1> %c, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
+ store <8 x i1> %x, <8 x i1>* %ptr, align 4
+ ret void
+}
+
+define void @store_v16i1(<16 x i1> %c , <16 x i1>* %ptr) {
+; KNL-LABEL: store_v16i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: knotw %k0, %k0
+; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_v16i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k0
+; SKX-NEXT: knotw %k0, %k0
+; SKX-NEXT: kmovw %k0, (%rdi)
+; SKX-NEXT: retq
+ %x = xor <16 x i1> %c, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
+ store <16 x i1> %x, <16 x i1>* %ptr, align 4
+ ret void
+}
+
+;void f2(int);
+;void f1(int c)
+;{
+; static int v = 0;
+; if (v == 0)
+; v = 1;
+; else
+; v = 0;
+; f2(v);
+;}
+
+@f1.v = internal unnamed_addr global i1 false, align 4
+
+define void @f1(i32 %c) {
+; KNL-LABEL: f1:
+; KNL: ## BB#0: ## %entry
+; KNL-NEXT: movzbl {{.*}}(%rip), %edi
+; KNL-NEXT: movl %edi, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: kmovw %eax, %k0
+; KNL-NEXT: kxnorw %k0, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kxorw %k1, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: movb %al, {{.*}}(%rip)
+; KNL-NEXT: xorl $1, %edi
+; KNL-NEXT: jmp _f2 ## TAILCALL
+;
+; SKX-LABEL: f1:
+; SKX: ## BB#0: ## %entry
+; SKX-NEXT: movzbl {{.*}}(%rip), %edi
+; SKX-NEXT: movl %edi, %eax
+; SKX-NEXT: andl $1, %eax
+; SKX-NEXT: kmovw %eax, %k0
+; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: kshiftrw $15, %k1, %k1
+; SKX-NEXT: kxorw %k1, %k0, %k0
+; SKX-NEXT: kmovb %k0, {{.*}}(%rip)
+; SKX-NEXT: xorl $1, %edi
+; SKX-NEXT: jmp _f2 ## TAILCALL
+entry:
+ %.b1 = load i1, i1* @f1.v, align 4
+ %not..b1 = xor i1 %.b1, true
+ store i1 %not..b1, i1* @f1.v, align 4
+ %0 = zext i1 %not..b1 to i32
+ tail call void @f2(i32 %0) #2
+ ret void
+}
+
+declare void @f2(i32) #1
+
+define void @store_i16_i1(i16 %x, i1 *%y) {
+; CHECK-LABEL: store_i16_i1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: movb %dil, (%rsi)
+; CHECK-NEXT: retq
+ %c = trunc i16 %x to i1
+ store i1 %c, i1* %y
+ ret void
+}
+
+define void @store_i8_i1(i8 %x, i1 *%y) {
+; CHECK-LABEL: store_i8_i1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: movb %dil, (%rsi)
+; CHECK-NEXT: retq
+ %c = trunc i8 %x to i1
+ store i1 %c, i1* %y
+ ret void
+}
+
+define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) {
+; KNL-LABEL: test_build_vec_v32i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; KNL-NEXT: vpsllw $15, %ymm2, %ymm2
+; KNL-NEXT: vpsraw $15, %ymm2, %ymm2
+; KNL-NEXT: vpand %ymm0, %ymm2, %ymm0
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; KNL-NEXT: vpsllw $15, %ymm2, %ymm2
+; KNL-NEXT: vpsraw $15, %ymm2, %ymm2
+; KNL-NEXT: vpand %ymm1, %ymm2, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_build_vec_v32i1:
+; SKX: ## BB#0:
+; SKX-NEXT: movl $1497715861, %eax ## imm = 0x59455495
+; SKX-NEXT: kmovd %eax, %k1
+; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %ret = select <32 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <32 x i16> %x, <32 x i16> zeroinitializer
+ ret <32 x i16> %ret
+}
+
+define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) {
+; KNL-LABEL: test_build_vec_v64i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_build_vec_v64i1:
+; SKX: ## BB#0:
+; SKX-NEXT: movabsq $6432645796886517060, %rax ## imm = 0x5945594549549544
+; SKX-NEXT: kmovq %rax, %k1
+; SKX-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %ret = select <64 x i1> <i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <64 x i8> %x, <64 x i8> zeroinitializer
+ ret <64 x i8> %ret
+}
+
+define void @ktest_1(<8 x double> %in, double * %base) {
+; KNL-LABEL: ktest_1:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovupd (%rdi), %zmm1
+; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; KNL-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z}
+; KNL-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1}
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: testb %al, %al
+; KNL-NEXT: je LBB41_2
+; KNL-NEXT: ## BB#1: ## %L1
+; KNL-NEXT: vmovapd %zmm0, (%rdi)
+; KNL-NEXT: retq
+; KNL-NEXT: LBB41_2: ## %L2
+; KNL-NEXT: vmovapd %zmm0, 8(%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: ktest_1:
+; SKX: ## BB#0:
+; SKX-NEXT: vmovupd (%rdi), %zmm1
+; SKX-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; SKX-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z}
+; SKX-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1}
+; SKX-NEXT: ktestb %k0, %k0
+; SKX-NEXT: je LBB41_2
+; SKX-NEXT: ## BB#1: ## %L1
+; SKX-NEXT: vmovapd %zmm0, (%rdi)
+; SKX-NEXT: retq
+; SKX-NEXT: LBB41_2: ## %L2
+; SKX-NEXT: vmovapd %zmm0, 8(%rdi)
+; SKX-NEXT: retq
+ %addr1 = getelementptr double, double * %base, i64 0
+ %addr2 = getelementptr double, double * %base, i64 1
+
+ %vaddr1 = bitcast double* %addr1 to <8 x double>*
+ %vaddr2 = bitcast double* %addr2 to <8 x double>*
+
+ %val1 = load <8 x double>, <8 x double> *%vaddr1, align 1
+ %val2 = load <8 x double>, <8 x double> *%vaddr2, align 1
+
+ %sel1 = fcmp ogt <8 x double>%in, %val1
+ %val3 = select <8 x i1> %sel1, <8 x double> %val2, <8 x double> zeroinitializer
+ %sel2 = fcmp olt <8 x double> %in, %val3
+ %sel3 = and <8 x i1> %sel1, %sel2
+
+ %int_sel3 = bitcast <8 x i1> %sel3 to i8
+ %res = icmp eq i8 %int_sel3, zeroinitializer
+ br i1 %res, label %L2, label %L1
+L1:
+ store <8 x double> %in, <8 x double>* %vaddr1
+ br label %End
+L2:
+ store <8 x double> %in, <8 x double>* %vaddr2
+ br label %End
+End:
+ ret void
+}
+
+define void @ktest_2(<32 x float> %in, float * %base) {
+;
+; KNL-LABEL: ktest_2:
+; KNL: ## BB#0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: Ltmp6:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: Ltmp7:
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: Ltmp8:
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-32, %rsp
+; KNL-NEXT: subq $32, %rsp
+; KNL-NEXT: vmovups (%rdi), %zmm2
+; KNL-NEXT: vmovups 64(%rdi), %zmm3
+; KNL-NEXT: vcmpltps %zmm1, %zmm3, %k1
+; KNL-NEXT: kshiftlw $14, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $15, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm3
+; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $13, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $12, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $11, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $10, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $9, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $8, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $7, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $6, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $5, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $4, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $3, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $2, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $1, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $0, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; KNL-NEXT: vcmpltps %zmm0, %zmm2, %k2
+; KNL-NEXT: kshiftlw $14, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $15, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm2
+; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $13, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $12, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $11, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $10, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $9, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $8, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $7, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $6, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $5, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $4, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $3, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $2, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $1, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $0, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; KNL-NEXT: vpsllw $7, %ymm2, %ymm2
+; KNL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; KNL-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2
+; KNL-NEXT: vmovups 4(%rdi), %zmm3 {%k2} {z}
+; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k1} {z}
+; KNL-NEXT: vcmpltps %zmm4, %zmm1, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm4
+; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
+; KNL-NEXT: vcmpltps %zmm3, %zmm0, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm3
+; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; KNL-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; KNL-NEXT: vpor %ymm3, %ymm2, %ymm2
+; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3
+; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
+; KNL-NEXT: vpslld $31, %zmm3, %zmm3
+; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: kmovw %k0, (%rsp)
+; KNL-NEXT: cmpl $0, (%rsp)
+; KNL-NEXT: je LBB42_2
+; KNL-NEXT: ## BB#1: ## %L1
+; KNL-NEXT: vmovaps %zmm0, (%rdi)
+; KNL-NEXT: vmovaps %zmm1, 64(%rdi)
+; KNL-NEXT: jmp LBB42_3
+; KNL-NEXT: LBB42_2: ## %L2
+; KNL-NEXT: vmovaps %zmm0, 4(%rdi)
+; KNL-NEXT: vmovaps %zmm1, 68(%rdi)
+; KNL-NEXT: LBB42_3: ## %End
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: retq
+;
+; SKX-LABEL: ktest_2:
+; SKX: ## BB#0:
+; SKX-NEXT: vmovups (%rdi), %zmm2
+; SKX-NEXT: vmovups 64(%rdi), %zmm3
+; SKX-NEXT: vcmpltps %zmm0, %zmm2, %k1
+; SKX-NEXT: vcmpltps %zmm1, %zmm3, %k2
+; SKX-NEXT: kunpckwd %k1, %k2, %k0
+; SKX-NEXT: vmovups 68(%rdi), %zmm2 {%k2} {z}
+; SKX-NEXT: vmovups 4(%rdi), %zmm3 {%k1} {z}
+; SKX-NEXT: vcmpltps %zmm3, %zmm0, %k1
+; SKX-NEXT: vcmpltps %zmm2, %zmm1, %k2
+; SKX-NEXT: kunpckwd %k1, %k2, %k1
+; SKX-NEXT: kord %k1, %k0, %k0
+; SKX-NEXT: ktestd %k0, %k0
+; SKX-NEXT: je LBB42_2
+; SKX-NEXT: ## BB#1: ## %L1
+; SKX-NEXT: vmovaps %zmm0, (%rdi)
+; SKX-NEXT: vmovaps %zmm1, 64(%rdi)
+; SKX-NEXT: retq
+; SKX-NEXT: LBB42_2: ## %L2
+; SKX-NEXT: vmovaps %zmm0, 4(%rdi)
+; SKX-NEXT: vmovaps %zmm1, 68(%rdi)
+; SKX-NEXT: retq
+ %addr1 = getelementptr float, float * %base, i64 0
+ %addr2 = getelementptr float, float * %base, i64 1
+
+ %vaddr1 = bitcast float* %addr1 to <32 x float>*
+ %vaddr2 = bitcast float* %addr2 to <32 x float>*
+
+ %val1 = load <32 x float>, <32 x float> *%vaddr1, align 1
+ %val2 = load <32 x float>, <32 x float> *%vaddr2, align 1
+
+ %sel1 = fcmp ogt <32 x float>%in, %val1
+ %val3 = select <32 x i1> %sel1, <32 x float> %val2, <32 x float> zeroinitializer
+ %sel2 = fcmp olt <32 x float> %in, %val3
+ %sel3 = or <32 x i1> %sel1, %sel2
+
+ %int_sel3 = bitcast <32 x i1> %sel3 to i32
+ %res = icmp eq i32 %int_sel3, zeroinitializer
+ br i1 %res, label %L2, label %L1
+L1:
+ store <32 x float> %in, <32 x float>* %vaddr1
+ br label %End
+L2:
+ store <32 x float> %in, <32 x float>* %vaddr2
+ br label %End
+End:
+ ret void
+}
+
+define <8 x i64> @load_8i1(<8 x i1>* %a) {
+; KNL-LABEL: load_8i1:
+; KNL: ## BB#0:
+; KNL-NEXT: movzbl (%rdi), %eax
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: load_8i1:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovb (%rdi), %k0
+; SKX-NEXT: vpmovm2q %k0, %zmm0
+; SKX-NEXT: retq
+ %b = load <8 x i1>, <8 x i1>* %a
+ %c = sext <8 x i1> %b to <8 x i64>
+ ret <8 x i64> %c
+}
+
+define <16 x i32> @load_16i1(<16 x i1>* %a) {
+; KNL-LABEL: load_16i1:
+; KNL: ## BB#0:
+; KNL-NEXT: kmovw (%rdi), %k1
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: load_16i1:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovw (%rdi), %k0
+; SKX-NEXT: vpmovm2d %k0, %zmm0
+; SKX-NEXT: retq
+ %b = load <16 x i1>, <16 x i1>* %a
+ %c = sext <16 x i1> %b to <16 x i32>
+ ret <16 x i32> %c
+}
+
+define <2 x i16> @load_2i1(<2 x i1>* %a) {
+; KNL-LABEL: load_2i1:
+; KNL: ## BB#0:
+; KNL-NEXT: movzbl (%rdi), %eax
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: load_2i1:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovb (%rdi), %k0
+; SKX-NEXT: vpmovm2q %k0, %xmm0
+; SKX-NEXT: retq
+ %b = load <2 x i1>, <2 x i1>* %a
+ %c = sext <2 x i1> %b to <2 x i16>
+ ret <2 x i16> %c
+}
+
+define <4 x i16> @load_4i1(<4 x i1>* %a) {
+; KNL-LABEL: load_4i1:
+; KNL: ## BB#0:
+; KNL-NEXT: movzbl (%rdi), %eax
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovqd %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: load_4i1:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovb (%rdi), %k0
+; SKX-NEXT: vpmovm2d %k0, %xmm0
+; SKX-NEXT: retq
+ %b = load <4 x i1>, <4 x i1>* %a
+ %c = sext <4 x i1> %b to <4 x i16>
+ ret <4 x i16> %c
+}
+
+define <32 x i16> @load_32i1(<32 x i1>* %a) {
+; KNL-LABEL: load_32i1:
+; KNL: ## BB#0:
+; KNL-NEXT: kmovw (%rdi), %k1
+; KNL-NEXT: kmovw 2(%rdi), %k2
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k2} {z}
+; KNL-NEXT: vpmovdw %zmm1, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: load_32i1:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovd (%rdi), %k0
+; SKX-NEXT: vpmovm2w %k0, %zmm0
+; SKX-NEXT: retq
+ %b = load <32 x i1>, <32 x i1>* %a
+ %c = sext <32 x i1> %b to <32 x i16>
+ ret <32 x i16> %c
+}
+
+define <64 x i8> @load_64i1(<64 x i1>* %a) {
+; KNL-LABEL: load_64i1:
+; KNL: ## BB#0:
+; KNL-NEXT: kmovw (%rdi), %k1
+; KNL-NEXT: kmovw 2(%rdi), %k2
+; KNL-NEXT: kmovw 4(%rdi), %k3
+; KNL-NEXT: kmovw 6(%rdi), %k4
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2} {z}
+; KNL-NEXT: vpmovdb %zmm2, %xmm2
+; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k3} {z}
+; KNL-NEXT: vpmovdb %zmm2, %xmm2
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k4} {z}
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
+; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; KNL-NEXT: retq
+;
+; SKX-LABEL: load_64i1:
+; SKX: ## BB#0:
+; SKX-NEXT: kmovq (%rdi), %k0
+; SKX-NEXT: vpmovm2b %k0, %zmm0
+; SKX-NEXT: retq
+ %b = load <64 x i1>, <64 x i1>* %a
+ %c = sext <64 x i1> %b to <64 x i8>
+ ret <64 x i8> %c
+}
+
+define void @store_8i1(<8 x i1>* %a, <8 x i1> %v) {
+; KNL-LABEL: store_8i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_8i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k0
+; SKX-NEXT: kmovb %k0, (%rdi)
+; SKX-NEXT: retq
+ store <8 x i1> %v, <8 x i1>* %a
+ ret void
+}
+
+define void @store_8i1_1(<8 x i1>* %a, <8 x i16> %v) {
+; KNL-LABEL: store_8i1_1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_8i1_1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k0
+; SKX-NEXT: kmovb %k0, (%rdi)
+; SKX-NEXT: retq
+ %v1 = trunc <8 x i16> %v to <8 x i1>
+ store <8 x i1> %v1, <8 x i1>* %a
+ ret void
+}
+
+define void @store_16i1(<16 x i1>* %a, <16 x i1> %v) {
+; KNL-LABEL: store_16i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_16i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k0
+; SKX-NEXT: kmovw %k0, (%rdi)
+; SKX-NEXT: retq
+ store <16 x i1> %v, <16 x i1>* %a
+ ret void
+}
+
+define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
+; KNL-LABEL: store_32i1:
+; KNL: ## BB#0:
+; KNL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, 2(%rdi)
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_32i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
+; SKX-NEXT: vpmovb2m %ymm0, %k0
+; SKX-NEXT: kmovd %k0, (%rdi)
+; SKX-NEXT: retq
+ store <32 x i1> %v, <32 x i1>* %a
+ ret void
+}
+
+define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
+; KNL-LABEL: store_32i1_1:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, 2(%rdi)
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_32i1_1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %zmm0, %zmm0
+; SKX-NEXT: vpmovw2m %zmm0, %k0
+; SKX-NEXT: kmovd %k0, (%rdi)
+; SKX-NEXT: retq
+ %v1 = trunc <32 x i16> %v to <32 x i1>
+ store <32 x i1> %v1, <32 x i1>* %a
+ ret void
+}
+
+
+define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
+;
+; KNL-LABEL: store_64i1:
+; KNL: ## BB#0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: Ltmp9:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: pushq %r15
+; KNL-NEXT: Ltmp10:
+; KNL-NEXT: .cfi_def_cfa_offset 24
+; KNL-NEXT: pushq %r14
+; KNL-NEXT: Ltmp11:
+; KNL-NEXT: .cfi_def_cfa_offset 32
+; KNL-NEXT: pushq %r13
+; KNL-NEXT: Ltmp12:
+; KNL-NEXT: .cfi_def_cfa_offset 40
+; KNL-NEXT: pushq %r12
+; KNL-NEXT: Ltmp13:
+; KNL-NEXT: .cfi_def_cfa_offset 48
+; KNL-NEXT: pushq %rbx
+; KNL-NEXT: Ltmp14:
+; KNL-NEXT: .cfi_def_cfa_offset 56
+; KNL-NEXT: Ltmp15:
+; KNL-NEXT: .cfi_offset %rbx, -56
+; KNL-NEXT: Ltmp16:
+; KNL-NEXT: .cfi_offset %r12, -48
+; KNL-NEXT: Ltmp17:
+; KNL-NEXT: .cfi_offset %r13, -40
+; KNL-NEXT: Ltmp18:
+; KNL-NEXT: .cfi_offset %r14, -32
+; KNL-NEXT: Ltmp19:
+; KNL-NEXT: .cfi_offset %r15, -24
+; KNL-NEXT: Ltmp20:
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
+; KNL-NEXT: vpslld $31, %zmm3, %zmm3
+; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r8d
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r10d
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r11d
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r14d
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r15d
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r12d
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r13d
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ebx
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ebp
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %esi
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vmovd %r9d, %xmm3
+; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $1, %r8d, %xmm3, %xmm2
+; KNL-NEXT: vpinsrb $2, %r10d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $4, %r14d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $5, %r15d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $6, %r12d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $7, %r13d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $8, %ebx, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $9, %ebp, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $12, %edx, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $13, %esi, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $14, %r9d, %xmm2, %xmm2
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: kmovw %k0, 6(%rdi)
+; KNL-NEXT: kshiftlw $14, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r8d
+; KNL-NEXT: kshiftlw $15, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: kshiftlw $13, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r9d
+; KNL-NEXT: kshiftlw $12, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r11d
+; KNL-NEXT: kshiftlw $11, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r14d
+; KNL-NEXT: kshiftlw $10, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r15d
+; KNL-NEXT: kshiftlw $9, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r12d
+; KNL-NEXT: kshiftlw $8, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r13d
+; KNL-NEXT: kshiftlw $7, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: kshiftlw $6, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: kshiftlw $5, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ebp
+; KNL-NEXT: kshiftlw $4, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ebx
+; KNL-NEXT: kshiftlw $3, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $2, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kshiftlw $1, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vmovd %r10d, %xmm2
+; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT: kshiftlw $0, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $1, %r8d, %xmm2, %xmm1
+; KNL-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $3, %r11d, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $4, %r14d, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $5, %r15d, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $6, %r12d, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $7, %r13d, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $8, %edx, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $10, %ebp, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $11, %ebx, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $14, %r10d, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, 4(%rdi)
+; KNL-NEXT: kshiftlw $14, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r8d
+; KNL-NEXT: kshiftlw $15, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: kshiftlw $13, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r9d
+; KNL-NEXT: kshiftlw $12, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r11d
+; KNL-NEXT: kshiftlw $11, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r14d
+; KNL-NEXT: kshiftlw $10, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r15d
+; KNL-NEXT: kshiftlw $9, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r12d
+; KNL-NEXT: kshiftlw $8, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %r13d
+; KNL-NEXT: kshiftlw $7, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: kshiftlw $6, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %esi
+; KNL-NEXT: kshiftlw $5, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ebp
+; KNL-NEXT: kshiftlw $4, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ebx
+; KNL-NEXT: kshiftlw $3, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $2, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kshiftlw $1, %k1, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vmovd %r10d, %xmm1
+; KNL-NEXT: kmovw %k0, %r10d
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftlw $0, %k1, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm0
+; KNL-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $14, %r10d, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT: kmovw %k1, 2(%rdi)
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r8d
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r10d
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r11d
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r14d
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r15d
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r12d
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %r13d
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %esi
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ebp
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ebx
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: vmovd %r9d, %xmm0
+; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $2, %r10d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $14, %r9d, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: popq %rbx
+; KNL-NEXT: popq %r12
+; KNL-NEXT: popq %r13
+; KNL-NEXT: popq %r14
+; KNL-NEXT: popq %r15
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: retq
+;
+; SKX-LABEL: store_64i1:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %zmm0, %zmm0
+; SKX-NEXT: vpmovb2m %zmm0, %k0
+; SKX-NEXT: kmovq %k0, (%rdi)
+; SKX-NEXT: retq
+ store <64 x i1> %v, <64 x i1>* %a
+ ret void
+}
diff --git a/test/CodeGen/X86/avx512-mask-spills.ll b/test/CodeGen/X86/avx512-mask-spills.ll
new file mode 100644
index 000000000000..68d283f0e33f
--- /dev/null
+++ b/test/CodeGen/X86/avx512-mask-spills.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+
+declare void @f()
+define <4 x i1> @test_4i1(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_4i1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: Ltmp0:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: callq _f
+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; CHECK-NEXT: korw %k1, %k0, %k0
+; CHECK-NEXT: vpmovm2d %k0, %xmm0
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+
+ %cmp_res = icmp ugt <4 x i32> %a, %b
+ %cmp_res2 = icmp sgt <4 x i32> %a, %b
+ call void @f()
+ %res = or <4 x i1> %cmp_res, %cmp_res2
+ ret <4 x i1> %res
+}
+
+define <8 x i1> @test_8i1(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_8i1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: Ltmp1:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: callq _f
+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; CHECK-NEXT: korb %k1, %k0, %k0
+; CHECK-NEXT: vpmovm2w %k0, %xmm0
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+
+ %cmp_res = icmp ugt <8 x i32> %a, %b
+ %cmp_res2 = icmp sgt <8 x i32> %a, %b
+ call void @f()
+ %res = or <8 x i1> %cmp_res, %cmp_res2
+ ret <8 x i1> %res
+}
+
+define <16 x i1> @test_16i1(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_16i1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: Ltmp2:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: callq _f
+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
+; CHECK-NEXT: korw %k1, %k0, %k0
+; CHECK-NEXT: vpmovm2b %k0, %xmm0
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %cmp_res = icmp ugt <16 x i32> %a, %b
+ %cmp_res2 = icmp sgt <16 x i32> %a, %b
+ call void @f()
+ %res = or <16 x i1> %cmp_res, %cmp_res2
+ ret <16 x i1> %res
+}
+
+define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
+; CHECK-LABEL: test_32i1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: Ltmp3:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovd %k0, (%rsp) ## 4-byte Spill
+; CHECK-NEXT: callq _f
+; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload
+; CHECK-NEXT: kmovd (%rsp), %k1 ## 4-byte Reload
+; CHECK-NEXT: kord %k1, %k0, %k0
+; CHECK-NEXT: vpmovm2b %k0, %ymm0
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %cmp_res = icmp ugt <32 x i16> %a, %b
+ %cmp_res2 = icmp sgt <32 x i16> %a, %b
+ call void @f()
+ %res = or <32 x i1> %cmp_res, %cmp_res2
+ ret <32 x i1> %res
+}
+
+define <64 x i1> @test_64i1(<64 x i8> %a, <64 x i8> %b) {
+; CHECK-LABEL: test_64i1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: subq $24, %rsp
+; CHECK-NEXT: Ltmp4:
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; CHECK-NEXT: callq _f
+; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload
+; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload
+; CHECK-NEXT: korq %k1, %k0, %k0
+; CHECK-NEXT: vpmovm2b %k0, %zmm0
+; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: retq
+
+ %cmp_res = icmp ugt <64 x i8> %a, %b
+ %cmp_res2 = icmp sgt <64 x i8> %a, %b
+ call void @f()
+ %res = or <64 x i1> %cmp_res, %cmp_res2
+ ret <64 x i1> %res
+}
diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll
index 0cd8458f73f5..6b07e9e704db 100644
--- a/test/CodeGen/X86/avx512-mov.ll
+++ b/test/CodeGen/X86/avx512-mov.ll
@@ -1,279 +1,320 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
-; CHECK-LABEL: @test1
-; CHECK: vmovd %xmm0, %eax ## encoding: [0x62
-; CHECK: ret
define i32 @test1(float %x) {
+; CHECK-LABEL: test1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd %xmm0, %eax ## encoding: [0x62,0xf1,0x7d,0x08,0x7e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = bitcast float %x to i32
ret i32 %res
}
-; CHECK-LABEL: @test2
-; CHECK: vmovd %edi, %xmm0 ## encoding: [0x62
-; CHECK: ret
define <4 x i32> @test2(i32 %x) {
+; CHECK-LABEL: test2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd %edi, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc7]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = insertelement <4 x i32>undef, i32 %x, i32 0
ret <4 x i32>%res
}
-; CHECK-LABEL: @test3
-; CHECK: vmovq %rdi, %xmm0 ## encoding: [0x62
-; CHECK: ret
define <2 x i64> @test3(i64 %x) {
+; CHECK-LABEL: test3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovq %rdi, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x6e,0xc7]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = insertelement <2 x i64>undef, i64 %x, i32 0
ret <2 x i64>%res
}
-; CHECK-LABEL: @test4
-; CHECK: vmovd (%rdi), %xmm0 ## encoding: [0x62
-; CHECK: ret
define <4 x i32> @test4(i32* %x) {
+; CHECK-LABEL: test4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0x07]
+; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retq ## encoding: [0xc3]
%y = load i32, i32* %x
%res = insertelement <4 x i32>undef, i32 %y, i32 0
ret <4 x i32>%res
}
-; CHECK-LABEL: @test5
-; CHECK: vmovss %xmm0, (%rdi) ## encoding: [0x62
-; CHECK: ret
define void @test5(float %x, float* %y) {
+; CHECK-LABEL: test5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovss %xmm0, (%rdi) ## encoding: [0x62,0xf1,0x7e,0x08,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
store float %x, float* %y, align 4
ret void
}
-; CHECK-LABEL: @test6
-; CHECK: vmovsd %xmm0, (%rdi) ## encoding: [0x62
-; CHECK: ret
define void @test6(double %x, double* %y) {
+; CHECK-LABEL: test6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovsd %xmm0, (%rdi) ## encoding: [0x62,0xf1,0xff,0x08,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
store double %x, double* %y, align 8
ret void
}
-; CHECK-LABEL: @test7
-; CHECK: vmovss (%rdi), %xmm0 ## encoding: [0x62
-; CHECK: ret
define float @test7(i32* %x) {
+; CHECK-LABEL: test7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovss (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x10,0x07]
+; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retq ## encoding: [0xc3]
%y = load i32, i32* %x
%res = bitcast i32 %y to float
ret float %res
}
-; CHECK-LABEL: @test8
-; CHECK: vmovd %xmm0, %eax ## encoding: [0x62
-; CHECK: ret
define i32 @test8(<4 x i32> %x) {
+; CHECK-LABEL: test8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd %xmm0, %eax ## encoding: [0x62,0xf1,0x7d,0x08,0x7e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = extractelement <4 x i32> %x, i32 0
ret i32 %res
}
-; CHECK-LABEL: @test9
-; CHECK: vmovq %xmm0, %rax ## encoding: [0x62
-; CHECK: ret
define i64 @test9(<2 x i64> %x) {
+; CHECK-LABEL: test9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovq %xmm0, %rax ## encoding: [0x62,0xf1,0xfd,0x08,0x7e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = extractelement <2 x i64> %x, i32 0
ret i64 %res
}
-; CHECK-LABEL: @test10
-; CHECK: vmovd (%rdi), %xmm0 ## encoding: [0x62
-; CHECK: ret
define <4 x i32> @test10(i32* %x) {
+; CHECK-LABEL: test10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0x07]
+; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retq ## encoding: [0xc3]
%y = load i32, i32* %x, align 4
%res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0
ret <4 x i32>%res
}
-; CHECK-LABEL: @test11
-; CHECK: vmovss (%rdi), %xmm0 ## encoding: [0x62
-; CHECK: ret
define <4 x float> @test11(float* %x) {
+; CHECK-LABEL: test11:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovss (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x10,0x07]
+; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retq ## encoding: [0xc3]
%y = load float, float* %x, align 4
%res = insertelement <4 x float>zeroinitializer, float %y, i32 0
ret <4 x float>%res
}
-; CHECK-LABEL: @test12
-; CHECK: vmovsd (%rdi), %xmm0 ## encoding: [0x62
-; CHECK: ret
define <2 x double> @test12(double* %x) {
+; CHECK-LABEL: test12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovsd (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x10,0x07]
+; CHECK-NEXT: ## xmm0 = mem[0],zero
+; CHECK-NEXT: retq ## encoding: [0xc3]
%y = load double, double* %x, align 8
%res = insertelement <2 x double>zeroinitializer, double %y, i32 0
ret <2 x double>%res
}
-; CHECK-LABEL: @test13
-; CHECK: vmovq %rdi, %xmm0 ## encoding: [0x62
-; CHECK: ret
define <2 x i64> @test13(i64 %x) {
+; CHECK-LABEL: test13:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovq %rdi, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x6e,0xc7]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = insertelement <2 x i64>zeroinitializer, i64 %x, i32 0
ret <2 x i64>%res
}
-; CHECK-LABEL: @test14
-; CHECK: vmovd %edi, %xmm0 ## encoding: [0x62
-; CHECK: ret
define <4 x i32> @test14(i32 %x) {
+; CHECK-LABEL: test14:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd %edi, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc7]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = insertelement <4 x i32>zeroinitializer, i32 %x, i32 0
ret <4 x i32>%res
}
-; CHECK-LABEL: @test15
-; CHECK: vmovd (%rdi), %xmm0 ## encoding: [0x62
-; CHECK: ret
define <4 x i32> @test15(i32* %x) {
+; CHECK-LABEL: test15:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0x07]
+; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retq ## encoding: [0xc3]
%y = load i32, i32* %x, align 4
%res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0
ret <4 x i32>%res
}
-; CHECK-LABEL: test16
-; CHECK: vmovdqu32
-; CHECK: ret
define <16 x i32> @test16(i8 * %addr) {
+; CHECK-LABEL: test16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7e,0x48,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i32>*
%res = load <16 x i32>, <16 x i32>* %vaddr, align 1
ret <16 x i32>%res
}
-; CHECK-LABEL: test17
-; CHECK: vmovdqa32
-; CHECK: ret
define <16 x i32> @test17(i8 * %addr) {
+; CHECK-LABEL: test17:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i32>*
%res = load <16 x i32>, <16 x i32>* %vaddr, align 64
ret <16 x i32>%res
}
-; CHECK-LABEL: test18
-; CHECK: vmovdqa64
-; CHECK: ret
define void @test18(i8 * %addr, <8 x i64> %data) {
+; CHECK-LABEL: test18:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x48,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i64>*
store <8 x i64>%data, <8 x i64>* %vaddr, align 64
ret void
}
-; CHECK-LABEL: test19
-; CHECK: vmovdqu32
-; CHECK: ret
define void @test19(i8 * %addr, <16 x i32> %data) {
+; CHECK-LABEL: test19:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu32 %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7e,0x48,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i32>*
store <16 x i32>%data, <16 x i32>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test20
-; CHECK: vmovdqa32
-; CHECK: ret
define void @test20(i8 * %addr, <16 x i32> %data) {
+; CHECK-LABEL: test20:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7d,0x48,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i32>*
store <16 x i32>%data, <16 x i32>* %vaddr, align 64
ret void
}
-; CHECK-LABEL: test21
-; CHECK: vmovdqa64
-; CHECK: ret
define <8 x i64> @test21(i8 * %addr) {
+; CHECK-LABEL: test21:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i64>*
%res = load <8 x i64>, <8 x i64>* %vaddr, align 64
ret <8 x i64>%res
}
-; CHECK-LABEL: test22
-; CHECK: vmovdqu64
-; CHECK: ret
define void @test22(i8 * %addr, <8 x i64> %data) {
+; CHECK-LABEL: test22:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i64>*
store <8 x i64>%data, <8 x i64>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test23
-; CHECK: vmovdqu64
-; CHECK: ret
define <8 x i64> @test23(i8 * %addr) {
+; CHECK-LABEL: test23:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i64>*
%res = load <8 x i64>, <8 x i64>* %vaddr, align 1
ret <8 x i64>%res
}
-; CHECK-LABEL: test24
-; CHECK: vmovapd
-; CHECK: ret
define void @test24(i8 * %addr, <8 x double> %data) {
+; CHECK-LABEL: test24:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovapd %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x48,0x29,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x double>*
store <8 x double>%data, <8 x double>* %vaddr, align 64
ret void
}
-; CHECK-LABEL: test25
-; CHECK: vmovapd
-; CHECK: ret
define <8 x double> @test25(i8 * %addr) {
+; CHECK-LABEL: test25:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x double>*
%res = load <8 x double>, <8 x double>* %vaddr, align 64
ret <8 x double>%res
}
-; CHECK-LABEL: test26
-; CHECK: vmovaps
-; CHECK: ret
define void @test26(i8 * %addr, <16 x float> %data) {
+; CHECK-LABEL: test26:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x float>*
store <16 x float>%data, <16 x float>* %vaddr, align 64
ret void
}
-; CHECK-LABEL: test27
-; CHECK: vmovaps
-; CHECK: ret
define <16 x float> @test27(i8 * %addr) {
+; CHECK-LABEL: test27:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x float>*
%res = load <16 x float>, <16 x float>* %vaddr, align 64
ret <16 x float>%res
}
-; CHECK-LABEL: test28
-; CHECK: vmovupd
-; CHECK: ret
define void @test28(i8 * %addr, <8 x double> %data) {
+; CHECK-LABEL: test28:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x48,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x double>*
store <8 x double>%data, <8 x double>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test29
-; CHECK: vmovupd
-; CHECK: ret
define <8 x double> @test29(i8 * %addr) {
+; CHECK-LABEL: test29:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x double>*
%res = load <8 x double>, <8 x double>* %vaddr, align 1
ret <8 x double>%res
}
-; CHECK-LABEL: test30
-; CHECK: vmovups
-; CHECK: ret
define void @test30(i8 * %addr, <16 x float> %data) {
+; CHECK-LABEL: test30:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x float>*
store <16 x float>%data, <16 x float>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test31
-; CHECK: vmovups
-; CHECK: ret
define <16 x float> @test31(i8 * %addr) {
+; CHECK-LABEL: test31:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x float>*
%res = load <16 x float>, <16 x float>* %vaddr, align 1
ret <16 x float>%res
}
-; CHECK-LABEL: test32
-; CHECK: vmovdqa32{{.*{%k[1-7]} }}
-; CHECK: ret
define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
+; CHECK-LABEL: test32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i32>*
%r = load <16 x i32>, <16 x i32>* %vaddr, align 64
@@ -281,10 +322,13 @@ define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
ret <16 x i32>%res
}
-; CHECK-LABEL: test33
-; CHECK: vmovdqu32{{.*{%k[1-7]} }}
-; CHECK: ret
define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
+; CHECK-LABEL: test33:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i32>*
%r = load <16 x i32>, <16 x i32>* %vaddr, align 1
@@ -292,10 +336,13 @@ define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
ret <16 x i32>%res
}
-; CHECK-LABEL: test34
-; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <16 x i32> @test34(i8 * %addr, <16 x i32> %mask1) {
+; CHECK-LABEL: test34:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i32>*
%r = load <16 x i32>, <16 x i32>* %vaddr, align 64
@@ -303,10 +350,13 @@ define <16 x i32> @test34(i8 * %addr, <16 x i32> %mask1) {
ret <16 x i32>%res
}
-; CHECK-LABEL: test35
-; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <16 x i32> @test35(i8 * %addr, <16 x i32> %mask1) {
+; CHECK-LABEL: test35:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xc9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i32>*
%r = load <16 x i32>, <16 x i32>* %vaddr, align 1
@@ -314,10 +364,13 @@ define <16 x i32> @test35(i8 * %addr, <16 x i32> %mask1) {
ret <16 x i32>%res
}
-; CHECK-LABEL: test36
-; CHECK: vmovdqa64{{.*{%k[1-7]} }}
-; CHECK: ret
define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
+; CHECK-LABEL: test36:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmq (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i64>*
%r = load <8 x i64>, <8 x i64>* %vaddr, align 64
@@ -325,10 +378,13 @@ define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
ret <8 x i64>%res
}
-; CHECK-LABEL: test37
-; CHECK: vmovdqu64{{.*{%k[1-7]} }}
-; CHECK: ret
define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
+; CHECK-LABEL: test37:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmq (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i64>*
%r = load <8 x i64>, <8 x i64>* %vaddr, align 1
@@ -336,10 +392,13 @@ define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
ret <8 x i64>%res
}
-; CHECK-LABEL: test38
-; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <8 x i64> @test38(i8 * %addr, <8 x i64> %mask1) {
+; CHECK-LABEL: test38:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i64>*
%r = load <8 x i64>, <8 x i64>* %vaddr, align 64
@@ -347,10 +406,13 @@ define <8 x i64> @test38(i8 * %addr, <8 x i64> %mask1) {
ret <8 x i64>%res
}
-; CHECK-LABEL: test39
-; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <8 x i64> @test39(i8 * %addr, <8 x i64> %mask1) {
+; CHECK-LABEL: test39:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xc9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i64>*
%r = load <8 x i64>, <8 x i64>* %vaddr, align 1
@@ -358,10 +420,14 @@ define <8 x i64> @test39(i8 * %addr, <8 x i64> %mask1) {
ret <8 x i64>%res
}
-; CHECK-LABEL: test40
-; CHECK: vmovaps{{.*{%k[1-7]} }}
-; CHECK: ret
define <16 x float> @test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1) {
+; CHECK-LABEL: test40:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK-NEXT: vcmpordps %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0x74,0x48,0xc2,0xca,0x07]
+; CHECK-NEXT: vcmpneqps %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x49,0xc2,0xca,0x04]
+; CHECK-NEXT: vblendmps (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <16 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x float>*
%r = load <16 x float>, <16 x float>* %vaddr, align 64
@@ -369,10 +435,14 @@ define <16 x float> @test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1)
ret <16 x float>%res
}
-; CHECK-LABEL: test41
-; CHECK: vmovups{{.*{%k[1-7]} }}
-; CHECK: ret
define <16 x float> @test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1) {
+; CHECK-LABEL: test41:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK-NEXT: vcmpordps %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0x74,0x48,0xc2,0xca,0x07]
+; CHECK-NEXT: vcmpneqps %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x49,0xc2,0xca,0x04]
+; CHECK-NEXT: vblendmps (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <16 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x float>*
%r = load <16 x float>, <16 x float>* %vaddr, align 1
@@ -380,10 +450,14 @@ define <16 x float> @test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1)
ret <16 x float>%res
}
-; CHECK-LABEL: test42
-; CHECK: vmovaps{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <16 x float> @test42(i8 * %addr, <16 x float> %mask1) {
+; CHECK-LABEL: test42:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK-NEXT: vcmpordps %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x48,0xc2,0xc9,0x07]
+; CHECK-NEXT: vcmpneqps %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0xc2,0xc9,0x04]
+; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <16 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x float>*
%r = load <16 x float>, <16 x float>* %vaddr, align 64
@@ -391,10 +465,14 @@ define <16 x float> @test42(i8 * %addr, <16 x float> %mask1) {
ret <16 x float>%res
}
-; CHECK-LABEL: test43
-; CHECK: vmovups{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <16 x float> @test43(i8 * %addr, <16 x float> %mask1) {
+; CHECK-LABEL: test43:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK-NEXT: vcmpordps %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x48,0xc2,0xc9,0x07]
+; CHECK-NEXT: vcmpneqps %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0xc2,0xc9,0x04]
+; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <16 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x float>*
%r = load <16 x float>, <16 x float>* %vaddr, align 1
@@ -402,10 +480,14 @@ define <16 x float> @test43(i8 * %addr, <16 x float> %mask1) {
ret <16 x float>%res
}
-; CHECK-LABEL: test44
-; CHECK: vmovapd{{.*{%k[1-7]} }}
-; CHECK: ret
define <8 x double> @test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1) {
+; CHECK-LABEL: test44:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK-NEXT: vcmpordpd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0xf5,0x48,0xc2,0xca,0x07]
+; CHECK-NEXT: vcmpneqpd %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0xc2,0xca,0x04]
+; CHECK-NEXT: vblendmpd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x double> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x double>*
%r = load <8 x double>, <8 x double>* %vaddr, align 64
@@ -413,10 +495,14 @@ define <8 x double> @test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1)
ret <8 x double>%res
}
-; CHECK-LABEL: test45
-; CHECK: vmovupd{{.*{%k[1-7]} }}
-; CHECK: ret
define <8 x double> @test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1) {
+; CHECK-LABEL: test45:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK-NEXT: vcmpordpd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0xf5,0x48,0xc2,0xca,0x07]
+; CHECK-NEXT: vcmpneqpd %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0xc2,0xca,0x04]
+; CHECK-NEXT: vblendmpd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x double> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x double>*
%r = load <8 x double>, <8 x double>* %vaddr, align 1
@@ -424,10 +510,14 @@ define <8 x double> @test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1)
ret <8 x double>%res
}
-; CHECK-LABEL: test46
-; CHECK: vmovapd{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <8 x double> @test46(i8 * %addr, <8 x double> %mask1) {
+; CHECK-LABEL: test46:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK-NEXT: vcmpordpd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0xfd,0x48,0xc2,0xc9,0x07]
+; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xc2,0xc9,0x04]
+; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x double> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x double>*
%r = load <8 x double>, <8 x double>* %vaddr, align 64
@@ -435,10 +525,14 @@ define <8 x double> @test46(i8 * %addr, <8 x double> %mask1) {
ret <8 x double>%res
}
-; CHECK-LABEL: test47
-; CHECK: vmovupd{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <8 x double> @test47(i8 * %addr, <8 x double> %mask1) {
+; CHECK-LABEL: test47:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK-NEXT: vcmpordpd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0xfd,0x48,0xc2,0xc9,0x07]
+; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xc2,0xc9,0x04]
+; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x double> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x double>*
%r = load <8 x double>, <8 x double>* %vaddr, align 1
diff --git a/test/CodeGen/X86/avx512-nontemporal.ll b/test/CodeGen/X86/avx512-nontemporal.ll
index bf57d021acab..adfaef25b7d3 100644
--- a/test/CodeGen/X86/avx512-nontemporal.ll
+++ b/test/CodeGen/X86/avx512-nontemporal.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86-64 -mattr=+avx512f | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+avx512f,+avx512bw | FileCheck %s
-define void @f(<16 x float> %A, <16 x float> %AA, i8* %B, <8 x double> %C, <8 x double> %CC, i32 %D, <8 x i64> %E, <8 x i64> %EE) {
+define void @f(<16 x float> %A, <16 x float> %AA, i8* %B, <8 x double> %C, <8 x double> %CC, <8 x i64> %E, <8 x i64> %EE, <16 x i32> %F, <16 x i32> %FF, <32 x i16> %G, <32 x i16> %GG, <64 x i8> %H, <64 x i8> %HH) {
; CHECK: vmovntps %z
%cast = bitcast i8* %B to <16 x float>*
%A2 = fadd <16 x float> %A, %AA
@@ -13,6 +13,18 @@ define void @f(<16 x float> %A, <16 x float> %AA, i8* %B, <8 x double> %C, <8 x
%cast2 = bitcast i8* %B to <8 x double>*
%C2 = fadd <8 x double> %C, %CC
store <8 x double> %C2, <8 x double>* %cast2, align 64, !nontemporal !0
+; CHECK: vmovntdq %z
+ %cast3 = bitcast i8* %B to <16 x i32>*
+ %F2 = add <16 x i32> %F, %FF
+ store <16 x i32> %F2, <16 x i32>* %cast3, align 64, !nontemporal !0
+; CHECK: vmovntdq %z
+ %cast4 = bitcast i8* %B to <32 x i16>*
+ %G2 = add <32 x i16> %G, %GG
+ store <32 x i16> %G2, <32 x i16>* %cast4, align 64, !nontemporal !0
+; CHECK: vmovntdq %z
+ %cast5 = bitcast i8* %B to <64 x i8>*
+ %H2 = add <64 x i8> %H, %HH
+ store <64 x i8> %H2, <64 x i8>* %cast5, align 64, !nontemporal !0
ret void
}
diff --git a/test/CodeGen/X86/avx512-scalarIntrinsics.ll b/test/CodeGen/X86/avx512-scalarIntrinsics.ll
new file mode 100644
index 000000000000..c26e1fb070fc
--- /dev/null
+++ b/test/CodeGen/X86/avx512-scalarIntrinsics.ll
@@ -0,0 +1,66 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+
+define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
+ ; CHECK-LABEL: test_rsqrt14_ss:
+ ; CHECK: ## BB#0:
+ ; CHECK-NEXT: vrsqrt14ss %xmm0, %xmm0, %xmm0
+ ; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ;
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
+ ; CHECK-LABEL: test_rcp14_ss:
+ ; CHECK: ## BB#0:
+ ; CHECK-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0
+ ; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ;
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <2 x double> @test_rsqrt14_sd(<2 x double> %a0) {
+ ; CHECK-LABEL: test_rsqrt14_sd:
+ ; CHECK: ## BB#0:
+ ; CHECK-NEXT: vrsqrt14sd %xmm0, %xmm0, %xmm0
+ ; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.rsqrt14.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 -1) ;
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx512.rsqrt14.sd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+
+define <2 x double> @test_rcp14_sd(<2 x double> %a0) {
+ ; CHECK-LABEL: test_rcp14_sd:
+ ; CHECK: ## BB#0:
+ ; CHECK-NEXT: vrcp14sd %xmm0, %xmm0, %xmm0
+ ; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.rcp14.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 -1) ;
+ ret <2 x double> %res
+
+}
+declare <2 x double> @llvm.x86.avx512.rcp14.sd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+
+declare <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float>, <4 x float>,<4 x float>, i8, i32)
+define <4 x float>@test_int_x86_avx512_mask_scalef_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
+ ; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ss:
+ ; CHECK: vscalefss %xmm1, %xmm0, %xmm2 {%k1}
+ ; CHECK: vscalefss {rn-sae}, %xmm1, %xmm0, %xmm0
+ %res = call <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 -1, i32 8)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double>, <2 x double>,<2 x double>, i8, i32)
+define <2 x double>@test_int_x86_avx512_mask_scalef_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
+ ; CHECK-LABEL: test_int_x86_avx512_mask_scalef_sd:
+ ; CHECK: vscalefsd %xmm1, %xmm0, %xmm2 {%k1}
+ ; CHECK: vscalefsd {rn-sae}, %xmm1, %xmm0, %xmm0
+ %res = call <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 -1, i32 8)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index b92e6f62813c..2ac91cc7482a 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -1,62 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-; CHECK-LABEL: select00
-; CHECK: vmovaps
-; CHECK-NEXT: LBB
define <16 x i32> @select00(i32 %a, <16 x i32> %b) nounwind {
+; CHECK-LABEL: select00:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: cmpl $255, %edi
+; CHECK-NEXT: je LBB0_2
+; CHECK-NEXT: ## BB#1:
+; CHECK-NEXT: vmovaps %zmm0, %zmm1
+; CHECK-NEXT: LBB0_2:
+; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%cmpres = icmp eq i32 %a, 255
%selres = select i1 %cmpres, <16 x i32> zeroinitializer, <16 x i32> %b
%res = xor <16 x i32> %b, %selres
ret <16 x i32> %res
}
-; CHECK-LABEL: select01
-; CHECK: vmovaps
-; CHECK-NEXT: LBB
define <8 x i64> @select01(i32 %a, <8 x i64> %b) nounwind {
+; CHECK-LABEL: select01:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: cmpl $255, %edi
+; CHECK-NEXT: je LBB1_2
+; CHECK-NEXT: ## BB#1:
+; CHECK-NEXT: vmovaps %zmm0, %zmm1
+; CHECK-NEXT: LBB1_2:
+; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%cmpres = icmp eq i32 %a, 255
%selres = select i1 %cmpres, <8 x i64> zeroinitializer, <8 x i64> %b
%res = xor <8 x i64> %b, %selres
ret <8 x i64> %res
}
-; CHECK-LABEL: @select02
-; CHECK: cmpless %xmm0, %xmm3, %k1
-; CHECK-NEXT: vmovss %xmm2, {{.*}}%xmm1 {%k1}
-; CHECK: ret
define float @select02(float %a, float %b, float %c, float %eps) {
+; CHECK-LABEL: select02:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpless %xmm0, %xmm3, %k1
+; CHECK-NEXT: vmovss %xmm2, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
%cmp = fcmp oge float %a, %eps
%cond = select i1 %cmp, float %c, float %b
ret float %cond
}
-; CHECK-LABEL: @select03
-; CHECK: cmplesd %xmm0, %xmm3, %k1
-; CHECK-NEXT: vmovsd %xmm2, {{.*}}%xmm1 {%k1}
-; CHECK: ret
define double @select03(double %a, double %b, double %c, double %eps) {
+; CHECK-LABEL: select03:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmplesd %xmm0, %xmm3, %k1
+; CHECK-NEXT: vmovsd %xmm2, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
%cmp = fcmp oge double %a, %eps
%cond = select i1 %cmp, double %c, double %b
ret double %cond
}
-; CHECK-LABEL: @select04
-; CHECK: vmovaps %zmm3, %zmm1
-; CHECK-NEXT: ret
-; PR20677
define <16 x double> @select04(<16 x double> %a, <16 x double> %b) {
+; CHECK-LABEL: select04:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps %zmm3, %zmm1
+; CHECK-NEXT: retq
%sel = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x double> %a, <16 x double> %b
ret <16 x double> %sel
}
-; CHECK-LABEL: select05
-; CHECK: movzbl %sil, %eax
-; CHECK: kmovw %eax, %k0
-; CHECK: movzbl %dil, %eax
-; CHECK: kmovw %eax, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
define i8 @select05(i8 %a.0, i8 %m) {
+; CHECK-LABEL: select05:
+; CHECK: ## BB#0:
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
%mask = bitcast i8 %m to <8 x i1>
%a = bitcast i8 %a.0 to <8 x i1>
%r = select <8 x i1> %mask, <8 x i1> <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>, <8 x i1> %a
@@ -64,14 +81,30 @@ define i8 @select05(i8 %a.0, i8 %m) {
ret i8 %res;
}
-; CHECK-LABEL: select06
-; CHECK: movzbl %sil, %eax
-; CHECK: kmovw %eax, %k0
-; CHECK: movzbl %dil, %eax
-; CHECK: kmovw %eax, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
+define i8 @select05_mem(<8 x i1>* %a.0, <8 x i1>* %m) {
+; CHECK-LABEL: select05_mem:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl (%rsi), %eax
+; CHECK-NEXT: kmovw %eax, %k0
+; CHECK-NEXT: movzbl (%rdi), %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: korw %k1, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %mask = load <8 x i1> , <8 x i1>* %m
+ %a = load <8 x i1> , <8 x i1>* %a.0
+ %r = select <8 x i1> %mask, <8 x i1> <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>, <8 x i1> %a
+ %res = bitcast <8 x i1> %r to i8
+ ret i8 %res;
+}
+
define i8 @select06(i8 %a.0, i8 %m) {
+; CHECK-LABEL: select06:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl %esi, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
%mask = bitcast i8 %m to <8 x i1>
%a = bitcast i8 %a.0 to <8 x i1>
%r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> zeroinitializer
@@ -79,19 +112,36 @@ define i8 @select06(i8 %a.0, i8 %m) {
ret i8 %res;
}
-; CHECK-LABEL: select07
-; CHECK-DAG: movzbl %dl, %eax
-; CHECK-DAG: kmovw %eax, %k0
-; CHECK-DAG: movzbl %dil, %eax
-; CHECK-DAG: kmovw %eax, %k1
-; CHECK-DAG: movzbl %sil, %eax
-; CHECK-DAG: kmovw %eax, %k2
-; CHECK: kandw %k0, %k1, %k1
-; CHECK-NEXT: knotw %k0, %k0
-; CHECK-NEXT: kandw %k0, %k2, %k0
-; CHECK-NEXT: korw %k0, %k1, %k0
-; CHECK-NEXT: kmovw %k0, %eax
+define i8 @select06_mem(<8 x i1>* %a.0, <8 x i1>* %m) {
+; CHECK-LABEL: select06_mem:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl (%rsi), %eax
+; CHECK-NEXT: kmovw %eax, %k0
+; CHECK-NEXT: movzbl (%rdi), %eax
+; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kandw %k1, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %mask = load <8 x i1> , <8 x i1>* %m
+ %a = load <8 x i1> , <8 x i1>* %a.0
+ %r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> zeroinitializer
+ %res = bitcast <8 x i1> %r to i8
+ ret i8 %res;
+}
define i8 @select07(i8 %a.0, i8 %b.0, i8 %m) {
+; CHECK-LABEL: select07:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k0
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: kmovw %esi, %k2
+; CHECK-NEXT: kandw %k0, %k1, %k1
+; CHECK-NEXT: knotw %k0, %k0
+; CHECK-NEXT: kandw %k0, %k2, %k0
+; CHECK-NEXT: korw %k0, %k1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
%mask = bitcast i8 %m to <8 x i1>
%a = bitcast i8 %a.0 to <8 x i1>
%b = bitcast i8 %b.0 to <8 x i1>
diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
index c54010cd91b9..c1d0a9a173e1 100644
--- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll
+++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
@@ -5,7 +5,7 @@ define <8 x i1> @test(<2 x i1> %a) {
; CHECK-LABEL: test:
; CHECK: # BB#0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vpmovq2m %xmm0, %k0
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
; CHECK-NEXT: kshiftlb $2, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
@@ -17,7 +17,7 @@ define <8 x i1> @test1(<2 x i1> %a) {
; CHECK-LABEL: test1:
; CHECK: # BB#0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vpmovq2m %xmm0, %k0
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
; CHECK-NEXT: kshiftlb $4, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
@@ -29,12 +29,12 @@ define <8 x i1> @test2(<2 x i1> %a) {
; CHECK-LABEL: test2:
; CHECK: # BB#0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vpmovq2m %xmm0, %k0
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
; CHECK-NEXT: vpmovm2q %k0, %zmm0
; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
; CHECK-NEXT: vpsllq $63, %zmm0, %zmm0
-; CHECK-NEXT: vpmovq2m %zmm0, %k0
+; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
%res = shufflevector <2 x i1> %a, <2 x i1> zeroinitializer, <8 x i32> <i32 3, i32 3, i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef>
@@ -45,7 +45,7 @@ define <8 x i1> @test3(<4 x i1> %a) {
; CHECK-LABEL: test3:
; CHECK: # BB#0:
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
-; CHECK-NEXT: vpmovd2m %xmm0, %k0
+; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
; CHECK-NEXT: kshiftlb $4, %k0, %k0
; CHECK-NEXT: kshiftrb $4, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
@@ -59,10 +59,13 @@ define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) {
; CHECK-LABEL: test4:
; CHECK: # BB#0:
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
-; CHECK-NEXT: vpmovd2m %xmm0, %k0
+; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
+; CHECK-NEXT: vpslld $31, %xmm1, %xmm0
+; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1
+; CHECK-NEXT: kshiftlb $4, %k1, %k1
; CHECK-NEXT: kshiftlb $4, %k0, %k0
-; CHECK-NEXT: kshiftrb $4, %k0, %k1
-; CHECK-NEXT: korb %k0, %k1, %k0
+; CHECK-NEXT: kshiftrb $4, %k0, %k0
+; CHECK-NEXT: korb %k1, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
@@ -74,10 +77,13 @@ define <4 x i1> @test5(<2 x i1> %a, <2 x i1>%b) {
; CHECK-LABEL: test5:
; CHECK: # BB#0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vpmovq2m %xmm0, %k0
-; CHECK-NEXT: kshiftlw $2, %k0, %k0
-; CHECK-NEXT: kshiftrw $2, %k0, %k1
-; CHECK-NEXT: korw %k0, %k1, %k0
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
+; CHECK-NEXT: vpsllq $63, %xmm1, %xmm0
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1
+; CHECK-NEXT: kshiftlb $2, %k1, %k1
+; CHECK-NEXT: kshiftlb $2, %k0, %k0
+; CHECK-NEXT: kshiftrb $2, %k0, %k0
+; CHECK-NEXT: korb %k1, %k0, %k0
; CHECK-NEXT: vpmovm2d %k0, %xmm0
; CHECK-NEXT: retq
@@ -89,10 +95,13 @@ define <16 x i1> @test6(<2 x i1> %a, <2 x i1>%b) {
; CHECK-LABEL: test6:
; CHECK: # BB#0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vpmovq2m %xmm0, %k0
-; CHECK-NEXT: kshiftlw $2, %k0, %k0
-; CHECK-NEXT: kshiftrw $2, %k0, %k1
-; CHECK-NEXT: korw %k0, %k1, %k0
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
+; CHECK-NEXT: vpsllq $63, %xmm1, %xmm0
+; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1
+; CHECK-NEXT: kshiftlb $2, %k1, %k1
+; CHECK-NEXT: kshiftlb $2, %k0, %k0
+; CHECK-NEXT: kshiftrb $2, %k0, %k0
+; CHECK-NEXT: korb %k1, %k0, %k0
; CHECK-NEXT: kunpckbw %k0, %k0, %k0
; CHECK-NEXT: vpmovm2b %k0, %xmm0
; CHECK-NEXT: retq
@@ -105,10 +114,13 @@ define <32 x i1> @test7(<4 x i1> %a, <4 x i1>%b) {
; CHECK-LABEL: test7:
; CHECK: # BB#0:
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
-; CHECK-NEXT: vpmovd2m %xmm0, %k0
+; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
+; CHECK-NEXT: vpslld $31, %xmm1, %xmm0
+; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1
+; CHECK-NEXT: kshiftlb $4, %k1, %k1
; CHECK-NEXT: kshiftlb $4, %k0, %k0
-; CHECK-NEXT: kshiftrb $4, %k0, %k1
-; CHECK-NEXT: korb %k0, %k1, %k0
+; CHECK-NEXT: kshiftrb $4, %k0, %k0
+; CHECK-NEXT: korb %k1, %k0, %k0
; CHECK-NEXT: kunpckbw %k0, %k0, %k0
; CHECK-NEXT: kunpckwd %k0, %k0, %k0
; CHECK-NEXT: vpmovm2b %k0, %ymm0
@@ -133,3 +145,26 @@ define <64 x i1> @test8(<8 x i1> %a, <8 x i1>%b) {
ret <64 x i1> %res
}
+define <4 x i1> @test9(<8 x i1> %a, <8 x i1> %b) {
+; CHECK-LABEL: test9:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
+; CHECK-NEXT: vpmovw2m %xmm0, %k0
+; CHECK-NEXT: kshiftrw $4, %k0, %k0
+; CHECK-NEXT: vpmovm2d %k0, %xmm0
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i1> %a, <8 x i1> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ ret <4 x i1> %res
+}
+
+define <2 x i1> @test10(<4 x i1> %a, <4 x i1> %b) {
+; CHECK-LABEL: test10:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
+; CHECK-NEXT: kshiftrw $2, %k0, %k0
+; CHECK-NEXT: vpmovm2q %k0, %xmm0
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x i1> %a, <4 x i1> %b, <2 x i32> <i32 2, i32 3>
+ ret <2 x i1> %res
+}
diff --git a/test/CodeGen/X86/avx512-trunc.ll b/test/CodeGen/X86/avx512-trunc.ll
index e4e5c2b8a1d5..35be44140026 100644
--- a/test/CodeGen/X86/avx512-trunc.ll
+++ b/test/CodeGen/X86/avx512-trunc.ll
@@ -53,7 +53,9 @@ define void @trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) #0 {
define <4 x i8> @trunc_qb_256(<4 x i64> %i) #0 {
; KNL-LABEL: trunc_qb_256:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qb_256:
@@ -67,6 +69,7 @@ define <4 x i8> @trunc_qb_256(<4 x i64> %i) #0 {
define void @trunc_qb_256_mem(<4 x i64> %i, <4 x i8>* %res) #0 {
; KNL-LABEL: trunc_qb_256_mem:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; KNL-NEXT: vmovd %xmm0, (%rdi)
@@ -128,7 +131,9 @@ define void @trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) #0 {
define <4 x i16> @trunc_qw_256(<4 x i64> %i) #0 {
; KNL-LABEL: trunc_qw_256:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qw_256:
@@ -142,6 +147,7 @@ define <4 x i16> @trunc_qw_256(<4 x i64> %i) #0 {
define void @trunc_qw_256_mem(<4 x i64> %i, <4 x i16>* %res) #0 {
; KNL-LABEL: trunc_qw_256_mem:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; KNL-NEXT: vmovq %xmm0, (%rdi)
@@ -203,7 +209,9 @@ define void @trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) #0 {
define <4 x i32> @trunc_qd_256(<4 x i64> %i) #0 {
; KNL-LABEL: trunc_qd_256:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qd_256:
@@ -217,6 +225,7 @@ define <4 x i32> @trunc_qd_256(<4 x i64> %i) #0 {
define void @trunc_qd_256_mem(<4 x i64> %i, <4 x i32>* %res) #0 {
; KNL-LABEL: trunc_qd_256_mem:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vmovaps %xmm0, (%rdi)
; KNL-NEXT: retq
@@ -276,7 +285,9 @@ define void @trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) #0 {
define <8 x i8> @trunc_db_256(<8 x i32> %i) #0 {
; KNL-LABEL: trunc_db_256:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_db_256:
@@ -290,6 +301,7 @@ define <8 x i8> @trunc_db_256(<8 x i32> %i) #0 {
define void @trunc_db_256_mem(<8 x i32> %i, <8 x i8>* %res) #0 {
; KNL-LABEL: trunc_db_256_mem:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; KNL-NEXT: vmovq %xmm0, (%rdi)
@@ -350,7 +362,9 @@ define void @trunc_dw_512_mem(<16 x i32> %i, <16 x i16>* %res) #0 {
define <8 x i16> @trunc_dw_256(<8 x i32> %i) #0 {
; KNL-LABEL: trunc_dw_256:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_dw_256:
@@ -364,6 +378,7 @@ define <8 x i16> @trunc_dw_256(<8 x i32> %i) #0 {
define void @trunc_dw_256_mem(<8 x i32> %i, <8 x i16>* %res) #0 {
; KNL-LABEL: trunc_dw_256_mem:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: vmovaps %xmm0, (%rdi)
; KNL-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-unsafe-fp-math.ll b/test/CodeGen/X86/avx512-unsafe-fp-math.ll
new file mode 100644
index 000000000000..1956b2f7eca9
--- /dev/null
+++ b/test/CodeGen/X86/avx512-unsafe-fp-math.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx512f | FileCheck %s --check-prefix=CHECK_UNSAFE --check-prefix=AVX512F_UNSAFE
+; RUN: llc < %s -mtriple=x86_64 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
+
+define <16 x float> @test_max_v16f32(<16 x float> * %a_ptr, <16 x float> %b) {
+; CHECK_UNSAFE-LABEL: test_max_v16f32:
+; CHECK_UNSAFE: # BB#0:
+; CHECK_UNSAFE-NEXT: vmaxps (%rdi), %zmm0, %zmm0
+; CHECK_UNSAFE-NEXT: retq
+;
+; CHECK-LABEL: test_max_v16f32:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vmaxps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %a = load <16 x float>, <16 x float>* %a_ptr
+ %tmp = fcmp fast ogt <16 x float> %a, %b
+ %tmp4 = select <16 x i1> %tmp, <16 x float> %a, <16 x float> %b
+ ret <16 x float> %tmp4;
+}
+
+define <16 x float> @test_min_v16f32(<16 x float>* %a_ptr, <16 x float> %b) {
+; CHECK_UNSAFE-LABEL: test_min_v16f32:
+; CHECK_UNSAFE: # BB#0:
+; CHECK_UNSAFE-NEXT: vminps (%rdi), %zmm0, %zmm0
+; CHECK_UNSAFE-NEXT: retq
+;
+; CHECK-LABEL: test_min_v16f32:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vminps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %a = load <16 x float>, <16 x float>* %a_ptr
+ %tmp = fcmp fast olt <16 x float> %a, %b
+ %tmp4 = select <16 x i1> %tmp, <16 x float> %a, <16 x float> %b
+ ret <16 x float> %tmp4;
+}
+
+define <8 x double> @test_max_v8f64(<8 x double> * %a_ptr, <8 x double> %b) {
+; CHECK_UNSAFE-LABEL: test_max_v8f64:
+; CHECK_UNSAFE: # BB#0:
+; CHECK_UNSAFE-NEXT: vmaxpd (%rdi), %zmm0, %zmm0
+; CHECK_UNSAFE-NEXT: retq
+;
+; CHECK-LABEL: test_max_v8f64:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm1
+; CHECK-NEXT: vmaxpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %a = load <8 x double>, <8 x double>* %a_ptr
+ %tmp = fcmp fast ogt <8 x double> %a, %b
+ %tmp4 = select <8 x i1> %tmp, <8 x double> %a, <8 x double> %b
+ ret <8 x double> %tmp4;
+}
+
+define <8 x double> @test_min_v8f64(<8 x double>* %a_ptr, <8 x double> %b) {
+; CHECK_UNSAFE-LABEL: test_min_v8f64:
+; CHECK_UNSAFE: # BB#0:
+; CHECK_UNSAFE-NEXT: vminpd (%rdi), %zmm0, %zmm0
+; CHECK_UNSAFE-NEXT: retq
+;
+; CHECK-LABEL: test_min_v8f64:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm1
+; CHECK-NEXT: vminpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %a = load <8 x double>, <8 x double>* %a_ptr
+ %tmp = fcmp fast olt <8 x double> %a, %b
+ %tmp4 = select <8 x i1> %tmp, <8 x double> %a, <8 x double> %b
+ ret <8 x double> %tmp4;
+}
+
+define float @test_min_f32(float %a, float* %ptr) {
+; CHECK_UNSAFE-LABEL: test_min_f32:
+; CHECK_UNSAFE: # BB#0: # %entry
+; CHECK_UNSAFE-NEXT: vminss (%rdi), %xmm0, %xmm0
+; CHECK_UNSAFE-NEXT: retq
+;
+; CHECK-LABEL: test_min_f32:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: vminss %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = load float, float* %ptr
+ %1 = fcmp fast olt float %0, %a
+ %2 = select i1 %1, float %0, float %a
+ ret float %2
+}
+
+define double @test_max_f64(double %a, double* %ptr) {
+; CHECK_UNSAFE-LABEL: test_max_f64:
+; CHECK_UNSAFE: # BB#0: # %entry
+; CHECK_UNSAFE-NEXT: vmaxsd (%rdi), %xmm0, %xmm0
+; CHECK_UNSAFE-NEXT: retq
+;
+; CHECK-LABEL: test_max_f64:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = load double, double* %ptr
+ %1 = fcmp fast ogt double %0, %a
+ %2 = select i1 %1, double %0, double %a
+ ret double %2
+}
diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll
index 4f679f9aca6f..005dc23ccf7b 100644
--- a/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -124,6 +124,7 @@ define <8 x double> @_inreg8xdouble(double %a) {
define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %mask1) {
; ALL-LABEL: _sd8xdouble_mask:
; ALL: # BB#0:
+; ALL-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
; ALL-NEXT: vpxor %ymm3, %ymm3, %ymm3
; ALL-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; ALL-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
@@ -139,6 +140,7 @@ define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %m
define <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) {
; ALL-LABEL: _sd8xdouble_maskz:
; ALL: # BB#0:
+; ALL-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; ALL-NEXT: vpxor %ymm2, %ymm2, %ymm2
; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
@@ -164,6 +166,7 @@ define <8 x double> @_sd8xdouble_load(double* %a.ptr) {
define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8 x i32> %mask1) {
; ALL-LABEL: _sd8xdouble_mask_load:
; ALL: # BB#0:
+; ALL-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; ALL-NEXT: vpxor %ymm2, %ymm2, %ymm2
; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; ALL-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1}
@@ -179,6 +182,7 @@ define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8
define <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1) {
; ALL-LABEL: _sd8xdouble_maskz_load:
; ALL: # BB#0:
+; ALL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; ALL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; ALL-NEXT: vpcmpneqd %zmm1, %zmm0, %k1
; ALL-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z}
@@ -214,9 +218,10 @@ define <16 x i32> @test_vbroadcast() {
; ALL: # BB#0: # %entry
; ALL-NEXT: vpxord %zmm0, %zmm0, %zmm0
; ALL-NEXT: vcmpunordps %zmm0, %zmm0, %k1
-; ALL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; ALL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; ALL-NEXT: knotw %k1, %k1
-; ALL-NEXT: vmovdqu32 %zmm0, %zmm0 {%k1} {z}
+; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; ALL-NEXT: retq
entry:
%0 = sext <16 x i1> zeroinitializer to <16 x i32>
@@ -398,3 +403,42 @@ define <8 x i64> @_invec4xi64(<4 x i64>%a) {
ret <8 x i64>%res
}
+declare void @func_f32(float)
+define <16 x float> @broadcast_ss_spill(float %x) {
+; ALL-LABEL: broadcast_ss_spill:
+; ALL: # BB#0:
+; ALL-NEXT: pushq %rax
+; ALL-NEXT: .Ltmp0:
+; ALL-NEXT: .cfi_def_cfa_offset 16
+; ALL-NEXT: vaddss %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; ALL-NEXT: callq func_f32
+; ALL-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %zmm0 # 4-byte Folded Reload
+; ALL-NEXT: popq %rax
+; ALL-NEXT: retq
+ %a = fadd float %x, %x
+ call void @func_f32(float %a)
+ %b = insertelement <16 x float> undef, float %a, i32 0
+ %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+ ret <16 x float> %c
+}
+
+declare void @func_f64(double)
+define <8 x double> @broadcast_sd_spill(double %x) {
+; ALL-LABEL: broadcast_sd_spill:
+; ALL: # BB#0:
+; ALL-NEXT: pushq %rax
+; ALL-NEXT: .Ltmp1:
+; ALL-NEXT: .cfi_def_cfa_offset 16
+; ALL-NEXT: vaddsd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovsd %xmm0, (%rsp) # 8-byte Spill
+; ALL-NEXT: callq func_f64
+; ALL-NEXT: vbroadcastsd (%rsp), %zmm0 # 8-byte Folded Reload
+; ALL-NEXT: popq %rax
+; ALL-NEXT: retq
+ %a = fadd double %x, %x
+ call void @func_f64(double %a)
+ %b = insertelement <8 x double> undef, double %a, i32 0
+ %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+ ret <8 x double> %c
+}
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index a8c558df9de8..5bda3bd173da 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1,37 +1,35 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=SKX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
-; KNL-LABEL: test1:
-; KNL: ## BB#0:
-; KNL-NEXT: vcmpleps %zmm1, %zmm0, %k1
-; KNL-NEXT: vmovaps %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpleps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = fcmp ole <16 x float> %x, %y
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
ret <16 x float> %max
}
define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
-; KNL-LABEL: test2:
-; KNL: ## BB#0:
-; KNL-NEXT: vcmplepd %zmm1, %zmm0, %k1
-; KNL-NEXT: vmovapd %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmplepd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = fcmp ole <8 x double> %x, %y
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
ret <8 x double> %max
}
define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwind {
-; KNL-LABEL: test3:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpeqd (%rdi), %zmm0, %k1
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k1
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <16 x i32>, <16 x i32>* %yp, align 4
%mask = icmp eq <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -39,36 +37,33 @@ define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwin
}
define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind {
-; KNL-LABEL: test4_unsigned:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpnltud %zmm1, %zmm0, %k1
-; KNL-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test4_unsigned:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp uge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
ret <16 x i32> %max
}
define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
-; KNL-LABEL: test5:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp eq <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
ret <8 x i64> %max
}
define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1) nounwind {
-; KNL-LABEL: test6_unsigned:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
-; KNL-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test6_unsigned:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpblendmq %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp ugt <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y
ret <8 x i64> %max
@@ -81,13 +76,13 @@ define <4 x float> @test7(<4 x float> %a, <4 x float> %b) {
; KNL-NEXT: vcmpltps %xmm2, %xmm0, %xmm2
; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
+;
; SKX-LABEL: test7:
; SKX: ## BB#0:
-; SKX: vxorps %xmm2, %xmm2, %xmm2
-; SKX: vcmpltps %xmm2, %xmm0, %k1
-; SKX: vmovaps %xmm0, %xmm1 {%k1}
-; SKX: vmovaps %zmm1, %zmm0
-; SKX: retq
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vcmpltps %xmm2, %xmm0, %k1
+; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
%mask = fcmp olt <4 x float> %a, zeroinitializer
%c = select <4 x i1>%mask, <4 x float>%a, <4 x float>%b
@@ -101,13 +96,13 @@ define <2 x double> @test8(<2 x double> %a, <2 x double> %b) {
; KNL-NEXT: vcmpltpd %xmm2, %xmm0, %xmm2
; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
+;
; SKX-LABEL: test8:
; SKX: ## BB#0:
-; SKX: vxorpd %xmm2, %xmm2, %xmm2
-; SKX: vcmpltpd %xmm2, %xmm0, %k1
-; SKX: vmovapd %xmm0, %xmm1 {%k1}
-; SKX: vmovaps %zmm1, %zmm0
-; SKX: retq
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vcmpltpd %xmm2, %xmm0, %k1
+; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
%mask = fcmp olt <2 x double> %a, zeroinitializer
%c = select <2 x i1>%mask, <2 x double>%a, <2 x double>%b
ret <2 x double>%c
@@ -116,9 +111,18 @@ define <2 x double> @test8(<2 x double> %a, <2 x double> %b) {
define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind {
; KNL-LABEL: test9:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
+;
+; SKX-LABEL: test9:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: retq
%mask = icmp eq <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
@@ -127,15 +131,18 @@ define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind {
define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind {
; KNL-LABEL: test10:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
+;
; SKX-LABEL: test10:
; SKX: ## BB#0:
-; SKX: vcmpeqps %ymm1, %ymm0, %k1
-; SKX: vmovaps %ymm0, %ymm1 {%k1}
-; SKX: vmovaps %zmm1, %zmm0
-; SKX: retq
+; SKX-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: retq
%mask = fcmp oeq <8 x float> %x, %y
%max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
@@ -143,29 +150,179 @@ define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind {
}
define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind {
-; KNL-LABEL: test11_unsigned:
-; KNL: ## BB#0:
-; KNL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test11_unsigned:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%mask = icmp ugt <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
}
define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
-; KNL-LABEL: test12:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
-; KNL-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
-; KNL-NEXT: kunpckbw %k0, %k1, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: retq
+; CHECK-LABEL: test12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: kunpckbw %k0, %k1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq
%res = icmp eq <16 x i64> %a, %b
%res1 = bitcast <16 x i1> %res to i16
ret i16 %res1
}
define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
+; KNL-LABEL: test12_v32i32:
+; KNL: ## BB#0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: andq $-32, %rsp
+; KNL-NEXT: subq $32, %rsp
+; KNL-NEXT: vpcmpeqd %zmm3, %zmm1, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm1
+; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm0
+; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, (%rsp)
+; KNL-NEXT: movl (%rsp), %eax
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: retq
+;
; SKX-LABEL: test12_v32i32:
; SKX: ## BB#0:
; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
@@ -179,6 +336,308 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
}
define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
+; KNL-LABEL: test12_v64i16:
+; KNL: ## BB#0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: andq $-32, %rsp
+; KNL-NEXT: subq $64, %rsp
+; KNL-NEXT: vpcmpeqw %ymm5, %ymm1, %ymm1
+; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm1
+; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
+; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm0
+; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, (%rsp)
+; KNL-NEXT: vpcmpeqw %ymm7, %ymm3, %ymm0
+; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm0
+; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm0
+; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm0
+; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; KNL-NEXT: kshiftlw $0, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: movl (%rsp), %ecx
+; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; KNL-NEXT: shlq $32, %rax
+; KNL-NEXT: orq %rcx, %rax
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: retq
+;
; SKX-LABEL: test12_v64i16:
; SKX: ## BB#0:
; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k0
@@ -192,11 +651,11 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
}
define <16 x i32> @test13(<16 x float>%a, <16 x float>%b)
-; KNL-LABEL: test13:
-; KNL: ## BB#0:
-; KNL-NEXT: vcmpeqps %zmm1, %zmm0, %k1
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
-; KNL-NEXT: retq
+; CHECK-LABEL: test13:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
{
%cmpvector_i = fcmp oeq <16 x float> %a, %b
%conv = zext <16 x i1> %cmpvector_i to <16 x i32>
@@ -204,14 +663,12 @@ define <16 x i32> @test13(<16 x float>%a, <16 x float>%b)
}
define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) {
-; KNL-LABEL: test14:
-; KNL: ## BB#0:
-; KNL-NEXT: vpsubd %zmm1, %zmm0, %zmm1
-; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
-; KNL-NEXT: knotw %k0, %k0
-; KNL-NEXT: knotw %k0, %k1
-; KNL-NEXT: vmovdqu32 %zmm1, %zmm0 {%k1} {z}
-; KNL-NEXT: retq
+; CHECK-LABEL: test14:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm1
+; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
+; CHECK-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
%sub_r = sub <16 x i32> %a, %b
%cmp.i2.i = icmp sgt <16 x i32> %sub_r, %a
%sext.i3.i = sext <16 x i1> %cmp.i2.i to <16 x i32>
@@ -221,14 +678,12 @@ define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) {
}
define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) {
-; KNL-LABEL: test15:
-; KNL: ## BB#0:
-; KNL-NEXT: vpsubq %zmm1, %zmm0, %zmm1
-; KNL-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
-; KNL-NEXT: knotw %k0, %k0
-; KNL-NEXT: knotw %k0, %k1
-; KNL-NEXT: vmovdqu64 %zmm1, %zmm0 {%k1} {z}
-; KNL-NEXT: retq
+; CHECK-LABEL: test15:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm1
+; CHECK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
%sub_r = sub <8 x i64> %a, %b
%cmp.i2.i = icmp sgt <8 x i64> %sub_r, %a
%sext.i3.i = sext <8 x i1> %cmp.i2.i to <8 x i64>
@@ -238,24 +693,22 @@ define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) {
}
define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind {
-; KNL-LABEL: test16:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpled %zmm0, %zmm1, %k1
-; KNL-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k1
+; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
ret <16 x i32> %max
}
define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
-; KNL-LABEL: test17:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test17:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sgt <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -263,12 +716,11 @@ define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
}
define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
-; KNL-LABEL: test18:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpled (%rdi), %zmm0, %k1
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test18:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled (%rdi), %zmm0, %k1
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sle <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -276,12 +728,11 @@ define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
}
define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
-; KNL-LABEL: test19:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpleud (%rdi), %zmm0, %k1
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test19:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp ule <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -289,13 +740,12 @@ define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
}
define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> %y1) nounwind {
-; KNL-LABEL: test20:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
-; KNL-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test20:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp eq <16 x i32> %x1, %y1
%mask0 = icmp eq <16 x i32> %x, %y
%mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
@@ -304,13 +754,12 @@ define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i3
}
define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1) nounwind {
-; KNL-LABEL: test21:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpleq %zmm1, %zmm0, %k1
-; KNL-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1}
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; KNL-NEXT: vmovaps %zmm2, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test21:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1}
+; CHECK-NEXT: vpblendmq %zmm0, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i64> %x1, %y1
%mask0 = icmp sle <8 x i64> %x, %y
%mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
@@ -319,13 +768,12 @@ define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y
}
define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind {
-; KNL-LABEL: test22:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpgtq %zmm2, %zmm1, %k1
-; KNL-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1}
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test22:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sgt <8 x i64> %x1, %y1
%y = load <8 x i64>, <8 x i64>* %y.ptr, align 4
%mask0 = icmp sgt <8 x i64> %x, %y
@@ -335,13 +783,12 @@ define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i6
}
define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind {
-; KNL-LABEL: test23:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpled %zmm1, %zmm2, %k1
-; KNL-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1}
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test23:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1
+; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <16 x i32> %x1, %y1
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask0 = icmp ule <16 x i32> %x, %y
@@ -351,12 +798,11 @@ define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16
}
define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind {
-; KNL-LABEL: test24:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test24:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1
+; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
%y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -366,12 +812,11 @@ define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind {
}
define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind {
-; KNL-LABEL: test25:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test25:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
%y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -381,13 +826,12 @@ define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind
}
define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind {
-; KNL-LABEL: test26:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpled %zmm1, %zmm2, %k1
-; KNL-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test26:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1
+; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <16 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
@@ -399,13 +843,12 @@ define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32
}
define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind {
-; KNL-LABEL: test27:
-; KNL: ## BB#0:
-; KNL-NEXT: vpcmpleq %zmm1, %zmm2, %k1
-; KNL-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1}
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: retq
+; CHECK-LABEL: test27:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleq %zmm1, %zmm2, %k1
+; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
@@ -416,11 +859,24 @@ define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y
ret <8 x i64> %max
}
-; KNL-LABEL: test28
-; KNL: vpcmpgtq
-; KNL: vpcmpgtq
-; KNL: kxnorw
define <8 x i32>@test28(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1) {
+; KNL-LABEL: test28:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; KNL-NEXT: vpcmpgtq %zmm3, %zmm2, %k1
+; KNL-NEXT: kxnorw %k1, %k0, %k1
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovqd %zmm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test28:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; SKX-NEXT: vpcmpgtq %zmm3, %zmm2, %k1
+; SKX-NEXT: kxnorb %k1, %k0, %k0
+; SKX-NEXT: vpmovm2d %k0, %ymm0
+; SKX-NEXT: retq
%x_gt_y = icmp sgt <8 x i64> %x, %y
%x1_gt_y1 = icmp sgt <8 x i64> %x1, %y1
%res = icmp eq <8 x i1>%x_gt_y, %x1_gt_y1
@@ -428,11 +884,24 @@ define <8 x i32>@test28(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1
ret <8 x i32> %resse
}
-; KNL-LABEL: test29
-; KNL: vpcmpgtd
-; KNL: vpcmpgtd
-; KNL: kxorw
define <16 x i8>@test29(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> %y1) {
+; KNL-LABEL: test29:
+; KNL: ## BB#0:
+; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; KNL-NEXT: vpcmpgtd %zmm3, %zmm2, %k1
+; KNL-NEXT: kxorw %k1, %k0, %k1
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test29:
+; SKX: ## BB#0:
+; SKX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; SKX-NEXT: vpcmpgtd %zmm3, %zmm2, %k1
+; SKX-NEXT: kxorw %k1, %k0, %k0
+; SKX-NEXT: vpmovm2b %k0, %xmm0
+; SKX-NEXT: retq
%x_gt_y = icmp sgt <16 x i32> %x, %y
%x1_gt_y1 = icmp sgt <16 x i32> %x1, %y1
%res = icmp ne <16 x i1>%x_gt_y, %x1_gt_y1
@@ -441,9 +910,17 @@ define <16 x i8>@test29(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32>
}
define <4 x double> @test30(<4 x double> %x, <4 x double> %y) nounwind {
+; KNL-LABEL: test30:
+; KNL: ## BB#0:
+; KNL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm2
+; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; KNL-NEXT: retq
+;
; SKX-LABEL: test30:
-; SKX: vcmpeqpd %ymm1, %ymm0, %k1
-; SKX: vmovapd %ymm0, %ymm1 {%k1}
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: retq
%mask = fcmp oeq <4 x double> %x, %y
%max = select <4 x i1> %mask, <4 x double> %x, <4 x double> %y
@@ -451,9 +928,17 @@ define <4 x double> @test30(<4 x double> %x, <4 x double> %y) nounwind {
}
define <2 x double> @test31(<2 x double> %x, <2 x double> %x1, <2 x double>* %yp) nounwind {
-; SKX-LABEL: test31:
-; SKX: vcmpltpd (%rdi), %xmm0, %k1
-; SKX: vmovapd %xmm0, %xmm1 {%k1}
+; KNL-LABEL: test31:
+; KNL: ## BB#0:
+; KNL-NEXT: vcmpltpd (%rdi), %xmm0, %xmm2
+; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test31:
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpltpd (%rdi), %xmm0, %k1
+; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
%y = load <2 x double>, <2 x double>* %yp, align 4
%mask = fcmp olt <2 x double> %x, %y
@@ -462,9 +947,17 @@ define <2 x double> @test31(<2 x double> %x, <2 x double> %x1, <2 x double>* %yp
}
define <4 x double> @test32(<4 x double> %x, <4 x double> %x1, <4 x double>* %yp) nounwind {
+; KNL-LABEL: test32:
+; KNL: ## BB#0:
+; KNL-NEXT: vcmpltpd (%rdi), %ymm0, %ymm2
+; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; KNL-NEXT: retq
+;
; SKX-LABEL: test32:
-; SKX: vcmpltpd (%rdi), %ymm0, %k1
-; SKX: vmovapd %ymm0, %ymm1 {%k1}
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpltpd (%rdi), %ymm0, %k1
+; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: retq
%y = load <4 x double>, <4 x double>* %yp, align 4
%mask = fcmp ogt <4 x double> %y, %x
@@ -473,9 +966,11 @@ define <4 x double> @test32(<4 x double> %x, <4 x double> %x1, <4 x double>* %yp
}
define <8 x double> @test33(<8 x double> %x, <8 x double> %x1, <8 x double>* %yp) nounwind {
-; SKX-LABEL: test33:
-; SKX: vcmpltpd (%rdi), %zmm0, %k1
-; SKX: vmovapd %zmm0, %zmm1 {%k1}
+; CHECK-LABEL: test33:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpltpd (%rdi), %zmm0, %k1
+; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <8 x double>, <8 x double>* %yp, align 4
%mask = fcmp olt <8 x double> %x, %y
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %x1
@@ -483,9 +978,17 @@ define <8 x double> @test33(<8 x double> %x, <8 x double> %x1, <8 x double>* %yp
}
define <4 x float> @test34(<4 x float> %x, <4 x float> %x1, <4 x float>* %yp) nounwind {
-; SKX-LABEL: test34:
-; SKX: vcmpltps (%rdi), %xmm0, %k1
-; SKX: vmovaps %xmm0, %xmm1 {%k1}
+; KNL-LABEL: test34:
+; KNL: ## BB#0:
+; KNL-NEXT: vcmpltps (%rdi), %xmm0, %xmm2
+; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test34:
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpltps (%rdi), %xmm0, %k1
+; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
%y = load <4 x float>, <4 x float>* %yp, align 4
%mask = fcmp olt <4 x float> %x, %y
%max = select <4 x i1> %mask, <4 x float> %x, <4 x float> %x1
@@ -493,9 +996,21 @@ define <4 x float> @test34(<4 x float> %x, <4 x float> %x1, <4 x float>* %yp) no
}
define <8 x float> @test35(<8 x float> %x, <8 x float> %x1, <8 x float>* %yp) nounwind {
+; KNL-LABEL: test35:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: vmovups (%rdi), %ymm2
+; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k1
+; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
; SKX-LABEL: test35:
-; SKX: vcmpltps (%rdi), %ymm0, %k1
-; SKX: vmovaps %ymm0, %ymm1 {%k1}
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpltps (%rdi), %ymm0, %k1
+; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: retq
%y = load <8 x float>, <8 x float>* %yp, align 4
%mask = fcmp ogt <8 x float> %y, %x
@@ -504,9 +1019,11 @@ define <8 x float> @test35(<8 x float> %x, <8 x float> %x1, <8 x float>* %yp) no
}
define <16 x float> @test36(<16 x float> %x, <16 x float> %x1, <16 x float>* %yp) nounwind {
-; SKX-LABEL: test36:
-; SKX: vcmpltps (%rdi), %zmm0, %k1
-; SKX: vmovaps %zmm0, %zmm1 {%k1}
+; CHECK-LABEL: test36:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpltps (%rdi), %zmm0, %k1
+; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <16 x float>, <16 x float>* %yp, align 4
%mask = fcmp olt <16 x float> %x, %y
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %x1
@@ -514,9 +1031,11 @@ define <16 x float> @test36(<16 x float> %x, <16 x float> %x1, <16 x float>* %yp
}
define <8 x double> @test37(<8 x double> %x, <8 x double> %x1, double* %ptr) nounwind {
-; SKX-LABEL: test37:
-; SKX: vcmpltpd (%rdi){1to8}, %zmm0, %k1
-; SKX: vmovapd %zmm0, %zmm1 {%k1}
+; CHECK-LABEL: test37:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1
+; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%a = load double, double* %ptr
%v = insertelement <8 x double> undef, double %a, i32 0
@@ -528,28 +1047,46 @@ define <8 x double> @test37(<8 x double> %x, <8 x double> %x1, double* %ptr) nou
}
define <4 x double> @test38(<4 x double> %x, <4 x double> %x1, double* %ptr) nounwind {
-; SKX-LABEL: test38:
-; SKX: vcmpltpd (%rdi){1to4}, %ymm0, %k1
-; SKX: vmovapd %ymm0, %ymm1 {%k1}
+; KNL-LABEL: test38:
+; KNL: ## BB#0:
+; KNL-NEXT: vbroadcastsd (%rdi), %ymm2
+; KNL-NEXT: vcmpltpd %ymm2, %ymm0, %ymm2
+; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test38:
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpltpd (%rdi){1to4}, %ymm0, %k1
+; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: retq
%a = load double, double* %ptr
%v = insertelement <4 x double> undef, double %a, i32 0
%shuffle = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> zeroinitializer
-
+
%mask = fcmp ogt <4 x double> %shuffle, %x
%max = select <4 x i1> %mask, <4 x double> %x, <4 x double> %x1
ret <4 x double> %max
}
define <2 x double> @test39(<2 x double> %x, <2 x double> %x1, double* %ptr) nounwind {
-; SKX-LABEL: test39:
-; SKX: vcmpltpd (%rdi){1to2}, %xmm0, %k1
-; SKX: vmovapd %xmm0, %xmm1 {%k1}
+; KNL-LABEL: test39:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; KNL-NEXT: vcmpltpd %xmm2, %xmm0, %xmm2
+; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test39:
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpltpd (%rdi){1to2}, %xmm0, %k1
+; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
%a = load double, double* %ptr
%v = insertelement <2 x double> undef, double %a, i32 0
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
-
+
%mask = fcmp ogt <2 x double> %shuffle, %x
%max = select <2 x i1> %mask, <2 x double> %x, <2 x double> %x1
ret <2 x double> %max
@@ -557,59 +1094,161 @@ define <2 x double> @test39(<2 x double> %x, <2 x double> %x1, double* %ptr) nou
define <16 x float> @test40(<16 x float> %x, <16 x float> %x1, float* %ptr) nounwind {
-; SKX-LABEL: test40:
-; SKX: vcmpltps (%rdi){1to16}, %zmm0, %k1
-; SKX: vmovaps %zmm0, %zmm1 {%k1}
+; CHECK-LABEL: test40:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpltps (%rdi){1to16}, %zmm0, %k1
+; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%a = load float, float* %ptr
%v = insertelement <16 x float> undef, float %a, i32 0
%shuffle = shufflevector <16 x float> %v, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-
+
%mask = fcmp ogt <16 x float> %shuffle, %x
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %x1
ret <16 x float> %max
}
define <8 x float> @test41(<8 x float> %x, <8 x float> %x1, float* %ptr) nounwind {
-; SKX-LABEL: test41:
-; SKX: vcmpltps (%rdi){1to8}, %ymm0, %k1
-; SKX: vmovaps %ymm0, %ymm1 {%k1}
+; KNL-LABEL: test41:
+; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: vbroadcastss (%rdi), %ymm2
+; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k1
+; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test41:
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpltps (%rdi){1to8}, %ymm0, %k1
+; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: retq
%a = load float, float* %ptr
%v = insertelement <8 x float> undef, float %a, i32 0
%shuffle = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-
+
%mask = fcmp ogt <8 x float> %shuffle, %x
%max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %x1
ret <8 x float> %max
}
define <4 x float> @test42(<4 x float> %x, <4 x float> %x1, float* %ptr) nounwind {
-; SKX-LABEL: test42:
-; SKX: vcmpltps (%rdi){1to4}, %xmm0, %k1
-; SKX: vmovaps %xmm0, %xmm1 {%k1}
-
+; KNL-LABEL: test42:
+; KNL: ## BB#0:
+; KNL-NEXT: vbroadcastss (%rdi), %xmm2
+; KNL-NEXT: vcmpltps %xmm2, %xmm0, %xmm2
+; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test42:
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpltps (%rdi){1to4}, %xmm0, %k1
+; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
+
%a = load float, float* %ptr
%v = insertelement <4 x float> undef, float %a, i32 0
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
-
+
%mask = fcmp ogt <4 x float> %shuffle, %x
%max = select <4 x i1> %mask, <4 x float> %x, <4 x float> %x1
ret <4 x float> %max
}
define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x i1> %mask_in) nounwind {
-; SKX-LABEL: test43:
-; SKX: vpmovw2m %xmm2, %k1
-; SKX: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
-; SKX: vmovapd %zmm0, %zmm1 {%k1}
+; KNL-LABEL: test43:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovsxwq %xmm2, %zmm2
+; KNL-NEXT: vpsllq $63, %zmm2, %zmm2
+; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
+; KNL-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test43:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm2, %xmm2
+; SKX-NEXT: vpmovw2m %xmm2, %k1
+; SKX-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
+; SKX-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
+; SKX-NEXT: retq
%a = load double, double* %ptr
%v = insertelement <8 x double> undef, double %a, i32 0
%shuffle = shufflevector <8 x double> %v, <8 x double> undef, <8 x i32> zeroinitializer
-
+
%mask_cmp = fcmp ogt <8 x double> %shuffle, %x
%mask = and <8 x i1> %mask_cmp, %mask_in
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %x1
ret <8 x double> %max
}
+
+define <4 x i32> @test44(<4 x i16> %x, <4 x i16> %y) #0 {
+; KNL-LABEL: test44:
+; KNL: ## BB#0:
+; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; KNL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; KNL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test44:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; SKX-NEXT: vpmovm2d %k0, %xmm0
+; SKX-NEXT: retq
+ %mask = icmp eq <4 x i16> %x, %y
+ %1 = sext <4 x i1> %mask to <4 x i32>
+ ret <4 x i32> %1
+}
+
+define <2 x i64> @test45(<2 x i16> %x, <2 x i16> %y) #0 {
+; KNL-LABEL: test45:
+; KNL: ## BB#0:
+; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; KNL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; KNL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vpsrlq $63, %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test45:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
+; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %mask = icmp eq <2 x i16> %x, %y
+ %1 = zext <2 x i1> %mask to <2 x i64>
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @test46(<2 x float> %x, <2 x float> %y) #0 {
+; KNL-LABEL: test46:
+; KNL: ## BB#0:
+; KNL-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; KNL-NEXT: vpsllq $32, %xmm0, %xmm0
+; KNL-NEXT: vpsrad $31, %xmm0, %xmm1
+; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test46:
+; SKX: ## BB#0:
+; SKX-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %mask = fcmp oeq <2 x float> %x, %y
+ %1 = zext <2 x i1> %mask to <2 x i64>
+ ret <2 x i64> %1
+}
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..50a9076163e8
--- /dev/null
+++ b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
@@ -0,0 +1,413 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c
+
+define <8 x i64> @test_mm512_broadcastb_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm512_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastb %xmm0, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastb %xmm0, %zmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <64 x i32> zeroinitializer
+ %res1 = bitcast <64 x i8> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_mask_broadcastb_epi8(<8 x i64> %a0, i64* %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm512_mask_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovq (%eax), %k1
+; X32-NEXT: vpbroadcastb %xmm1, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovq (%rdi), %k1
+; X64-NEXT: vpbroadcastb %xmm1, %zmm0 {%k1}
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
+ %bc1 = bitcast i64* %a1 to <64 x i1>*
+ %arg1 = load <64 x i1>, <64 x i1>* %bc1
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res0 = shufflevector <16 x i8> %arg2, <16 x i8> undef, <64 x i32> zeroinitializer
+ %res1 = select <64 x i1> %arg1, <64 x i8> %res0, <64 x i8> %arg0
+ %res2 = bitcast <64 x i8> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_broadcastb_epi8(i64* %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm512_maskz_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovq (%eax), %k1
+; X32-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovq (%rdi), %k1
+; X64-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+ %bc0 = bitcast i64* %a0 to <64 x i1>*
+ %arg0 = load <64 x i1>, <64 x i1>* %bc0
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res0 = shufflevector <16 x i8> %arg1, <16 x i8> undef, <64 x i32> zeroinitializer
+ %res1 = select <64 x i1> %arg0, <64 x i8> %res0, <64 x i8> zeroinitializer
+ %res2 = bitcast <64 x i8> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_broadcastw_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm512_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastw %xmm0, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastw %xmm0, %zmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <32 x i32> zeroinitializer
+ %res1 = bitcast <32 x i16> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_mask_broadcastw_epi16(<8 x i64> %a0, i32 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm512_mask_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpbroadcastw %xmm1, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastw %xmm1, %zmm0 {%k1}
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
+ %arg1 = bitcast i32 %a1 to <32 x i1>
+ %arg2 = bitcast <2 x i64> %a2 to <8 x i16>
+ %res0 = shufflevector <8 x i16> %arg2, <8 x i16> undef, <32 x i32> zeroinitializer
+ %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
+ %res2 = bitcast <32 x i16> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_broadcastw_epi16(i32 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm512_maskz_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i32 %a0 to <32 x i1>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res0 = shufflevector <8 x i16> %arg1, <8 x i16> undef, <32 x i32> zeroinitializer
+ %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
+ %res2 = bitcast <32 x i16> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_bslli_epi128(<8 x i64> %a0) {
+; X32-LABEL: test_mm512_bslli_epi128:
+; X32: # BB#0:
+; X32-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_bslli_epi128:
+; X64: # BB#0:
+; X64-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
+ %res0 = shufflevector <64 x i8> %arg0, <64 x i8> zeroinitializer, <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122>
+ %res1 = bitcast <64 x i8> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_bsrli_epi128(<8 x i64> %a0) {
+; X32-LABEL: test_mm512_bsrli_epi128:
+; X32: # BB#0:
+; X32-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zmm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zmm0[37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zmm0[53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_bsrli_epi128:
+; X64: # BB#0:
+; X64-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zmm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zmm0[37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zmm0[53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
+ %res0 = shufflevector <64 x i8> %arg0, <64 x i8> zeroinitializer, <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116>
+ %res1 = bitcast <64 x i8> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_unpackhi_epi8(<8 x i64> %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_unpackhi_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpackhi_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
+ %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
+ %res0 = shufflevector <64 x i8> %arg0, <64 x i8> %arg1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
+ %res1 = bitcast <64 x i8> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+; TODO - improve support for i64 -> mmask64 on 32-bit targets
+define <8 x i64> @test_mm512_mask_unpackhi_epi8(<8 x i64> %a0, i64* %a1, <8 x i64> %a2, <8 x i64> %a3) {
+; X32-LABEL: test_mm512_mask_unpackhi_epi8:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovq (%eax), %k1
+; X32-NEXT: vpunpckhbw {{.*#+}} zmm0 {%k1} = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpackhi_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovq (%rdi), %k1
+; X64-NEXT: vpunpckhbw {{.*#+}} zmm0 {%k1} = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
+ %arg1 = bitcast i64* %a1 to <64 x i1>*
+ %sel1 = load <64 x i1>, <64 x i1>* %arg1
+ %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
+ %arg3 = bitcast <8 x i64> %a3 to <64 x i8>
+ %res0 = shufflevector <64 x i8> %arg2, <64 x i8> %arg3, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
+ %res1 = select <64 x i1> %sel1, <64 x i8> %res0, <64 x i8> %arg0
+ %res2 = bitcast <64 x i8> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_unpackhi_epi8(i64* %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_maskz_unpackhi_epi8:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovq (%eax), %k1
+; X32-NEXT: vpunpckhbw {{.*#+}} zmm0 {%k1} {z} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpackhi_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovq (%rdi), %k1
+; X64-NEXT: vpunpckhbw {{.*#+}} zmm0 {%k1} {z} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; X64-NEXT: retq
+ %arg0 = bitcast i64* %a0 to <64 x i1>*
+ %sel0 = load <64 x i1>, <64 x i1>* %arg0
+ %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
+ %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
+ %res0 = shufflevector <64 x i8> %arg1, <64 x i8> %arg2, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
+ %res1 = select <64 x i1> %sel0, <64 x i8> %res0, <64 x i8> zeroinitializer
+ %res2 = bitcast <64 x i8> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_unpackhi_epi16(<8 x i64> %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_unpackhi_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpackhi_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
+ %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
+ %res0 = shufflevector <32 x i16> %arg0, <32 x i16> %arg1, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+ %res1 = bitcast <32 x i16> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_mask_unpackhi_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) {
+; X32-LABEL: test_mm512_mask_unpackhi_epi16:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpackhi_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
+ %arg1 = bitcast i32 %a1 to <32 x i1>
+ %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
+ %arg3 = bitcast <8 x i64> %a3 to <32 x i16>
+ %res0 = shufflevector <32 x i16> %arg2, <32 x i16> %arg3, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+ %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
+ %res2 = bitcast <32 x i16> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_unpackhi_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_maskz_unpackhi_epi16:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpackhi_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; X64-NEXT: retq
+ %arg0 = bitcast i32 %a0 to <32 x i1>
+ %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
+ %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
+ %res0 = shufflevector <32 x i16> %arg1, <32 x i16> %arg2, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+ %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
+ %res2 = bitcast <32 x i16> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_unpacklo_epi8(<8 x i64> %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_unpacklo_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpacklo_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
+ %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
+ %res0 = shufflevector <64 x i8> %arg0, <64 x i8> %arg1, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
+ %res1 = bitcast <64 x i8> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_mask_unpacklo_epi8(<8 x i64> %a0, i64* %a1, <8 x i64> %a2, <8 x i64> %a3) {
+; X32-LABEL: test_mm512_mask_unpacklo_epi8:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovq (%eax), %k1
+; X32-NEXT: vpunpcklbw {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpacklo_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovq (%rdi), %k1
+; X64-NEXT: vpunpcklbw {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
+ %arg1 = bitcast i64* %a1 to <64 x i1>*
+ %sel1 = load <64 x i1>, <64 x i1>* %arg1
+ %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
+ %arg3 = bitcast <8 x i64> %a3 to <64 x i8>
+ %res0 = shufflevector <64 x i8> %arg2, <64 x i8> %arg3, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
+ %res1 = select <64 x i1> %sel1, <64 x i8> %res0, <64 x i8> %arg0
+ %res2 = bitcast <64 x i8> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_unpacklo_epi8(i64* %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_maskz_unpacklo_epi8:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovq (%eax), %k1
+; X32-NEXT: vpunpcklbw {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpacklo_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovq (%rdi), %k1
+; X64-NEXT: vpunpcklbw {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; X64-NEXT: retq
+ %arg0 = bitcast i64* %a0 to <64 x i1>*
+ %sel0 = load <64 x i1>, <64 x i1>* %arg0
+ %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
+ %arg2 = bitcast <8 x i64> %a2 to <64 x i8>
+ %res0 = shufflevector <64 x i8> %arg1, <64 x i8> %arg2, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
+ %res1 = select <64 x i1> %sel0, <64 x i8> %res0, <64 x i8> zeroinitializer
+ %res2 = bitcast <64 x i8> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_unpacklo_epi16(<8 x i64> %a0, <8 x i64> %a1) {
+; X32-LABEL: test_mm512_unpacklo_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_unpacklo_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
+ %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
+ %res0 = shufflevector <32 x i16> %arg0, <32 x i16> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
+ %res1 = bitcast <32 x i16> %res0 to <8 x i64>
+ ret <8 x i64> %res1
+}
+
+define <8 x i64> @test_mm512_mask_unpacklo_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) {
+; X32-LABEL: test_mm512_mask_unpacklo_epi16:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_unpacklo_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27]
+; X64-NEXT: retq
+ %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
+ %arg1 = bitcast i32 %a1 to <32 x i1>
+ %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
+ %arg3 = bitcast <8 x i64> %a3 to <32 x i16>
+ %res0 = shufflevector <32 x i16> %arg2, <32 x i16> %arg3, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
+ %res1 = select <32 x i1> %arg1, <32 x i16> %res0, <32 x i16> %arg0
+ %res2 = bitcast <32 x i16> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+define <8 x i64> @test_mm512_maskz_unpacklo_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+; X32-LABEL: test_mm512_maskz_unpacklo_epi16:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_unpacklo_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; X64-NEXT: retq
+ %arg0 = bitcast i32 %a0 to <32 x i1>
+ %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
+ %arg2 = bitcast <8 x i64> %a2 to <32 x i16>
+ %res0 = shufflevector <32 x i16> %arg1, <32 x i16> %arg2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
+ %res1 = select <32 x i1> %arg0, <32 x i16> %res0, <32 x i16> zeroinitializer
+ %res2 = bitcast <32 x i16> %res1 to <8 x i64>
+ ret <8 x i64> %res2
+}
+
+!0 = !{i32 1}
+
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
new file mode 100644
index 000000000000..cb2f23e90f20
--- /dev/null
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -0,0 +1,538 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
+
+declare void @llvm.x86.avx512.mask.storeu.b.512(i8*, <64 x i8>, i64)
+
+define void@test_int_x86_avx512_mask_storeu_b_512(i8* %ptr1, i8* %ptr2, <64 x i8> %x1, i64 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_storeu_b_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovq %rdx, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
+; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_b_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vmovdqu8 %zmm0, (%ecx) {%k1}
+; AVX512F-32-NEXT: vmovdqu8 %zmm0, (%eax)
+; AVX512F-32-NEXT: retl
+ call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr1, <64 x i8> %x1, i64 %x2)
+ call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr2, <64 x i8> %x1, i64 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.w.512(i8*, <32 x i16>, i32)
+
+define void@test_int_x86_avx512_mask_storeu_w_512(i8* %ptr1, i8* %ptr2, <32 x i16> %x1, i32 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_storeu_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edx, %k1
+; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1}
+; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vmovdqu16 %zmm0, (%ecx) {%k1}
+; AVX512F-32-NEXT: vmovdqu16 %zmm0, (%eax)
+; AVX512F-32-NEXT: retl
+ call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr1, <32 x i16> %x1, i32 %x2)
+ call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr2, <32 x i16> %x1, i32 -1)
+ ret void
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8*, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_loadu_w_512(i8* %ptr, i8* %ptr2, <32 x i16> %x1, i32 %mask) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BW-NEXT: kmovd %edx, %k1
+; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0 {%k1}
+; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_loadu_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm0
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
+; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm1 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res0 = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr, <32 x i16> %x1, i32 -1)
+ %res = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr2, <32 x i16> %res0, i32 %mask)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr, <32 x i16> zeroinitializer, i32 %mask)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8*, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_loadu_b_512(i8* %ptr, i8* %ptr2, <64 x i8> %x1, i64 %mask) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_b_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
+; AVX512BW-NEXT: kmovq %rdx, %k1
+; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 {%k1}
+; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_loadu_b_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm0
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vmovdqu8 (%eax), %zmm0 {%k1}
+; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm1 {%k1} {z}
+; AVX512F-32-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res0 = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr, <64 x i8> %x1, i64 -1)
+ %res = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr2, <64 x i8> %res0, i64 %mask)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr, <64 x i8> zeroinitializer, i64 %mask)
+ %res2 = add <64 x i8> %res, %res1
+ ret <64 x i8> %res2
+}
+
+declare <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64>, i32)
+
+define <8 x i64>@test_int_x86_avx512_psll_dq_512(<8 x i64> %x0) {
+; AVX512BW-LABEL: test_int_x86_avx512_psll_dq_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpslldq {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55]
+; AVX512BW-NEXT: vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59]
+; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_psll_dq_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpslldq {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55]
+; AVX512F-32-NEXT: vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59]
+; AVX512F-32-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 8)
+ %res1 = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 4)
+ %res2 = add <8 x i64> %res, %res1
+ ret <8 x i64> %res2
+}
+
+define <8 x i64>@test_int_x86_avx512_psll_load_dq_512(<8 x i64>* %p0) {
+; AVX512BW-LABEL: test_int_x86_avx512_psll_load_dq_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,mem[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,mem[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,mem[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,mem[48,49,50,51,52,53,54,55,56,57,58,59]
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_psll_load_dq_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,mem[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,mem[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,mem[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,mem[48,49,50,51,52,53,54,55,56,57,58,59]
+; AVX512F-32-NEXT: retl
+ %x0 = load <8 x i64>, <8 x i64> *%p0
+ %res = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 4)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64>, i32)
+
+define <8 x i64>@test_int_x86_avx512_psrl_dq_512(<8 x i64> %x0) {
+; AVX512BW-LABEL: test_int_x86_avx512_psrl_dq_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm1 = zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
+; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_psrl_dq_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpsrldq {{.*#+}} zmm1 = zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-32-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
+; AVX512F-32-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 8)
+ %res1 = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 4)
+ %res2 = add <8 x i64> %res, %res1
+ ret <8 x i64> %res2
+}
+
+define <8 x i64>@test_int_x86_avx512_psrl_load_dq_512(<8 x i64>* %p0) {
+; AVX512BW-LABEL: test_int_x86_avx512_psrl_load_dq_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm0 = mem[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,mem[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,mem[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,mem[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_psrl_load_dq_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vpsrldq {{.*#+}} zmm0 = mem[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,mem[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,mem[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,mem[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
+; AVX512F-32-NEXT: retl
+ %x0 = load <8 x i64>, <8 x i64> *%p0
+ %res = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 4)
+ ret <8 x i64> %res
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8>, <64 x i8>, i32, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_palignr_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x3, i64 %x4) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_palignr_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpalignr {{.*#+}} zmm3 = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vpalignr {{.*#+}} zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
+; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_palignr_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpalignr {{.*#+}} zmm3 = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpalignr {{.*#+}} zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
+; AVX512F-32-NEXT: vpalignr {{.*#+}} zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
+; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
+; AVX512F-32-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 %x4)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> zeroinitializer, i64 %x4)
+ %res2 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 -1)
+ %res3 = add <64 x i8> %res, %res1
+ %res4 = add <64 x i8> %res3, %res2
+ ret <64 x i8> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16>, i32, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_pshufh_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pshufh_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpshufhw {{.*#+}} zmm2 = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
+; AVX512BW-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshufh_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpshufhw {{.*#+}} zmm2 = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
+; AVX512F-32-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16>, i32, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_pshufl_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pshufl_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpshuflw {{.*#+}} zmm2 = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshufl_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpshuflw {{.*#+}} zmm2 = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
+; AVX512F-32-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
+
+define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) {
+; AVX512BW-LABEL: test_pcmpeq_b:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_pcmpeq_b:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: subl $12, %esp
+; AVX512F-32-NEXT: .Ltmp0:
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovq %k0, (%esp)
+; AVX512F-32-NEXT: movl (%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: retl
+ %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
+ ret i64 %res
+}
+
+define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
+; AVX512BW-LABEL: test_mask_pcmpeq_b:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_mask_pcmpeq_b:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: subl $12, %esp
+; AVX512F-32-NEXT: .Ltmp1:
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, (%esp)
+; AVX512F-32-NEXT: movl (%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: retl
+ %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
+ ret i64 %res
+}
+
+declare i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8>, <64 x i8>, i64)
+
+define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) {
+; AVX512BW-LABEL: test_pcmpeq_w:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_pcmpeq_w:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: retl
+ %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
+ ret i32 %res
+}
+
+define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_pcmpeq_w:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_mask_pcmpeq_w:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: retl
+ %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
+ ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16>, <32 x i16>, i32)
+
+define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) {
+; AVX512BW-LABEL: test_pcmpgt_b:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_pcmpgt_b:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: subl $12, %esp
+; AVX512F-32-NEXT: .Ltmp2:
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovq %k0, (%esp)
+; AVX512F-32-NEXT: movl (%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: retl
+ %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
+ ret i64 %res
+}
+
+define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
+; AVX512BW-LABEL: test_mask_pcmpgt_b:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_mask_pcmpgt_b:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: subl $12, %esp
+; AVX512F-32-NEXT: .Ltmp3:
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, (%esp)
+; AVX512F-32-NEXT: movl (%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: retl
+ %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
+ ret i64 %res
+}
+
+declare i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8>, <64 x i8>, i64)
+
+define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) {
+; AVX512BW-LABEL: test_pcmpgt_w:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_pcmpgt_w:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: retl
+ %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
+ ret i32 %res
+}
+
+define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_pcmpgt_w:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_mask_pcmpgt_w:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: retl
+ %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
+ ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16>, <32 x i16>, i32)
+
+declare <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_punpckhb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 {%k1} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpunpckhbw {{.*#+}} zmm2 {%k1} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512F-32-NEXT: vpaddb %zmm3, %zmm2, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
+ %res2 = add <64 x i8> %res, %res1
+ ret <64 x i8> %res2
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_punpcklb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpunpcklbw {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512F-32-NEXT: vpaddb %zmm3, %zmm2, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
+ %res2 = add <64 x i8> %res, %res1
+ ret <64 x i8> %res2
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_punpckhw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpunpckhwd {{.*#+}} zmm3 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpunpckhwd {{.*#+}} zmm2 {%k1} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpunpckhwd {{.*#+}} zmm3 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpunpckhwd {{.*#+}} zmm2 {%k1} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_punpcklw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpunpcklwd {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpunpcklwd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
+
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 7cf6edafbcc8..b131befcf0a2 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -2,178 +2,6 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
-define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) {
-; AVX512BW-LABEL: test_pcmpeq_b:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_pcmpeq_b:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: subl $12, %esp
-; AVX512F-32-NEXT: .Ltmp0:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
-; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT: kmovq %k0, (%esp)
-; AVX512F-32-NEXT: movl (%esp), %eax
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: addl $12, %esp
-; AVX512F-32-NEXT: retl
- %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
- ret i64 %res
-}
-
-define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
-; AVX512BW-LABEL: test_mask_pcmpeq_b:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovq %rdi, %k1
-; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_mask_pcmpeq_b:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: subl $12, %esp
-; AVX512F-32-NEXT: .Ltmp1:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT: kmovq %k0, (%esp)
-; AVX512F-32-NEXT: movl (%esp), %eax
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: addl $12, %esp
-; AVX512F-32-NEXT: retl
- %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
- ret i64 %res
-}
-
-declare i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8>, <64 x i8>, i64)
-
-define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_pcmpeq_w:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_pcmpeq_w:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT: kmovd %k0, %eax
-; AVX512F-32-NEXT: retl
- %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
- ret i32 %res
-}
-
-define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_pcmpeq_w:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_mask_pcmpeq_w:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT: kmovd %k0, %eax
-; AVX512F-32-NEXT: retl
- %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
- ret i32 %res
-}
-
-declare i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16>, <32 x i16>, i32)
-
-define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) {
-; AVX512BW-LABEL: test_pcmpgt_b:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_pcmpgt_b:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: subl $12, %esp
-; AVX512F-32-NEXT: .Ltmp2:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
-; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT: kmovq %k0, (%esp)
-; AVX512F-32-NEXT: movl (%esp), %eax
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: addl $12, %esp
-; AVX512F-32-NEXT: retl
- %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
- ret i64 %res
-}
-
-define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
-; AVX512BW-LABEL: test_mask_pcmpgt_b:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovq %rdi, %k1
-; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_mask_pcmpgt_b:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: subl $12, %esp
-; AVX512F-32-NEXT: .Ltmp3:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT: kmovq %k0, (%esp)
-; AVX512F-32-NEXT: movl (%esp), %eax
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: addl $12, %esp
-; AVX512F-32-NEXT: retl
- %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
- ret i64 %res
-}
-
-declare i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8>, <64 x i8>, i64)
-
-define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_pcmpgt_w:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_pcmpgt_w:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT: kmovd %k0, %eax
-; AVX512F-32-NEXT: retl
- %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
- ret i32 %res
-}
-
-define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_pcmpgt_w:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_mask_pcmpgt_w:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT: kmovd %k0, %eax
-; AVX512F-32-NEXT: retl
- %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
- ret i32 %res
-}
-
-declare i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16>, <32 x i16>, i32)
-
define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512BW-LABEL: test_cmp_b_512:
; AVX512BW: ## BB#0:
@@ -205,7 +33,7 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512F-32-LABEL: test_cmp_b_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $68, %esp
-; AVX512F-32-NEXT: .Ltmp4:
+; AVX512F-32-NEXT: .Ltmp0:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 72
; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
@@ -214,31 +42,31 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512F-32-NEXT: vpcmpltb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpleb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpunordb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnltb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnleb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpordb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: addl (%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $68, %esp
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
@@ -291,7 +119,7 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-LABEL: test_mask_cmp_b_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $68, %esp
-; AVX512F-32-NEXT: .Ltmp5:
+; AVX512F-32-NEXT: .Ltmp1:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 72
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
@@ -303,31 +131,31 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: vpcmpltb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpunordb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnleb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpordb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $68, %esp
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
@@ -381,7 +209,7 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512F-32-LABEL: test_ucmp_b_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $68, %esp
-; AVX512F-32-NEXT: .Ltmp6:
+; AVX512F-32-NEXT: .Ltmp2:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 72
; AVX512F-32-NEXT: vpcmpequb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
@@ -390,31 +218,31 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512F-32-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpleub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpunordub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnequb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpordub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: addl (%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $68, %esp
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
@@ -467,7 +295,7 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-LABEL: test_mask_x86_avx512_ucmp_b_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $68, %esp
-; AVX512F-32-NEXT: .Ltmp7:
+; AVX512F-32-NEXT: .Ltmp3:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 72
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
@@ -479,31 +307,31 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpunordub %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnequb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpordub %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $68, %esp
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
@@ -822,43 +650,6 @@ define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
declare i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
-declare <64 x i8> @llvm.x86.avx512.mask.blend.b.512(<64 x i8>, <64 x i8>, i64) nounwind readonly
-
-define <32 x i16> @test_x86_mask_blend_w_512(i32 %mask, <32 x i16> %a1, <32 x i16> %a2) {
-; AVX512BW-LABEL: test_x86_mask_blend_w_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpblendmw %zmm1, %zmm0, %zmm0 {%k1}
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_x86_mask_blend_w_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vpblendmw %zmm1, %zmm0, %zmm0 {%k1}
-; AVX512F-32-NEXT: retl
- %res = call <32 x i16> @llvm.x86.avx512.mask.blend.w.512(<32 x i16> %a1, <32 x i16> %a2, i32 %mask) ; <<32 x i16>> [#uses=1]
- ret <32 x i16> %res
-}
-declare <32 x i16> @llvm.x86.avx512.mask.blend.w.512(<32 x i16>, <32 x i16>, i32) nounwind readonly
-
-define <64 x i8> @test_x86_mask_blend_b_512(i64 %a0, <64 x i8> %a1, <64 x i8> %a2) {
-; AVX512BW-LABEL: test_x86_mask_blend_b_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovq %rdi, %k1
-; AVX512BW-NEXT: vpblendmb %zmm1, %zmm0, %zmm0 {%k1}
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_x86_mask_blend_b_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT: vpblendmb %zmm1, %zmm0, %zmm0 {%k1}
-; AVX512F-32-NEXT: retl
- %res = call <64 x i8> @llvm.x86.avx512.mask.blend.b.512(<64 x i8> %a1, <64 x i8> %a2, i64 %a0) ; <<64 x i8>> [#uses=1]
- ret <64 x i8> %res
-}
-
define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
; AVX512BW-LABEL: test_mask_packs_epi32_rr_512:
; AVX512BW: ## BB#0:
@@ -2510,138 +2301,6 @@ define <16 x i32>@test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i1
ret <16 x i32> %res2
}
-declare <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
-
-define <64 x i8>@test_int_x86_avx512_mask_punpckhb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovq %rdi, %k1
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31],zmm2[40],k1[40],zmm2[41],k1[41],zmm2[42],k1[42],zmm2[43],k1[43],zmm2[44],k1[44],zmm2[45],k1[45],zmm2[46],k1[46],zmm2[47],k1[47],zmm2[56],k1[56],zmm2[57],k1[57],zmm2[58],k1[58],zmm2[59],k1[59],zmm2[60],k1[60],zmm2[61],k1[61],zmm2[62],k1[62],zmm2[63],k1[63]
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31],zmm2[40],k1[40],zmm2[41],k1[41],zmm2[42],k1[42],zmm2[43],k1[43],zmm2[44],k1[44],zmm2[45],k1[45],zmm2[46],k1[46],zmm2[47],k1[47],zmm2[56],k1[56],zmm2[57],k1[57],zmm2[58],k1[58],zmm2[59],k1[59],zmm2[60],k1[60],zmm2[61],k1[61],zmm2[62],k1[62],zmm2[63],k1[63]
-; AVX512F-32-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
- %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
- %res2 = add <64 x i8> %res, %res1
- ret <64 x i8> %res2
-}
-
-declare <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
-
-define <64 x i8>@test_int_x86_avx512_mask_punpcklb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovq %rdi, %k1
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[32],k1[32],zmm2[33],k1[33],zmm2[34],k1[34],zmm2[35],k1[35],zmm2[36],k1[36],zmm2[37],k1[37],zmm2[38],k1[38],zmm2[39],k1[39],zmm2[48],k1[48],zmm2[49],k1[49],zmm2[50],k1[50],zmm2[51],k1[51],zmm2[52],k1[52],zmm2[53],k1[53],zmm2[54],k1[54],zmm2[55],k1[55]
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[32],k1[32],zmm2[33],k1[33],zmm2[34],k1[34],zmm2[35],k1[35],zmm2[36],k1[36],zmm2[37],k1[37],zmm2[38],k1[38],zmm2[39],k1[39],zmm2[48],k1[48],zmm2[49],k1[49],zmm2[50],k1[50],zmm2[51],k1[51],zmm2[52],k1[52],zmm2[53],k1[53],zmm2[54],k1[54],zmm2[55],k1[55]
-; AVX512F-32-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
- %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
- %res2 = add <64 x i8> %res, %res1
- ret <64 x i8> %res2
-}
-
-declare <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
-
-define <32 x i16>@test_int_x86_avx512_mask_punpckhw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpunpckhwd {{.*#+}} zmm2 = zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31]
-; AVX512BW-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vpunpckhwd {{.*#+}} zmm2 = zmm2[4],k1[4],zmm2[5],k1[5],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[12],k1[12],zmm2[13],k1[13],zmm2[14],k1[14],zmm2[15],k1[15],zmm2[20],k1[20],zmm2[21],k1[21],zmm2[22],k1[22],zmm2[23],k1[23],zmm2[28],k1[28],zmm2[29],k1[29],zmm2[30],k1[30],zmm2[31],k1[31]
-; AVX512F-32-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
- %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
- %res2 = add <32 x i16> %res, %res1
- ret <32 x i16> %res2
-}
-
-declare <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
-
-define <32 x i16>@test_int_x86_avx512_mask_punpcklw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27]
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vpunpcklwd {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[2],k1[2],zmm2[3],k1[3],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[16],k1[16],zmm2[17],k1[17],zmm2[18],k1[18],zmm2[19],k1[19],zmm2[24],k1[24],zmm2[25],k1[25],zmm2[26],k1[26],zmm2[27],k1[27]
-; AVX512F-32-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
- %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
- %res2 = add <32 x i16> %res, %res1
- ret <32 x i16> %res2
-}
-
-declare <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8>, <64 x i8>, i32, <64 x i8>, i64)
-
-define <64 x i8>@test_int_x86_avx512_mask_palignr_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x3, i64 %x4) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_palignr_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovq %rdi, %k1
-; AVX512BW-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
-; AVX512BW-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_palignr_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT: vpaddb %zmm3, %zmm0, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 %x4)
- %res1 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> zeroinitializer, i64 %x4)
- %res2 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 -1)
- %res3 = add <64 x i8> %res, %res1
- %res4 = add <64 x i8> %res3, %res2
- ret <64 x i8> %res4
-}
-
declare <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8>, <64 x i8>, i32, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) {
@@ -2672,49 +2331,6 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8>
ret <32 x i16> %res4
}
-declare <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64>, i32)
-
-define <8 x i64>@test_int_x86_avx512_mask_psll_dq_512(<8 x i64> %x0) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_psll_dq_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpslldq $8, %zmm0, %zmm1
-; AVX512BW-NEXT: vpslldq $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_psll_dq_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpslldq $8, %zmm0, %zmm1
-; AVX512F-32-NEXT: vpslldq $4, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 8)
- %res1 = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 4)
- %res2 = add <8 x i64> %res, %res1
- ret <8 x i64> %res2
-}
-
-declare <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64>, i32)
-
-define <8 x i64>@test_int_x86_avx512_mask_psrl_dq_512(<8 x i64> %x0) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_psrl_dq_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpsrldq $8, %zmm0, %zmm1
-; AVX512BW-NEXT: vpsrldq $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_dq_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpsrldq $8, %zmm0, %zmm1
-; AVX512F-32-NEXT: vpsrldq $4, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 8)
- %res1 = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 4)
- %res2 = add <8 x i64> %res, %res1
- ret <8 x i64> %res2
-}
declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>)
define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){
@@ -2773,7 +2389,7 @@ define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $12, %esp
-; AVX512F-32-NEXT: .Ltmp8:
+; AVX512F-32-NEXT: .Ltmp4:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
@@ -2799,7 +2415,7 @@ define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) {
; AVX512F-32-LABEL: test_int_x86_avx512_cvtb2mask_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $12, %esp
-; AVX512F-32-NEXT: .Ltmp9:
+; AVX512F-32-NEXT: .Ltmp5:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, (%esp)
@@ -2879,6 +2495,16 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16>
; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: vpaddw %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
@@ -2887,9 +2513,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16>
ret <32 x i16> %res4
}
-declare <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16>, i8, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16>, i32, <32 x i16>, i32)
-define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i8 %x1, <32 x i16> %x2, i32 %x3) {
+define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psrl_wi_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
@@ -2899,9 +2525,19 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i8 %x1, <
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
- %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 %x3)
- %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 -1)
- %res2 = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i8 3, <32 x i16> zeroinitializer, i32 %x3)
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_wi_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm2 {%k1} {z}
+; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
%res3 = add <32 x i16> %res, %res1
%res4 = add <32 x i16> %res3, %res2
ret <32 x i16> %res4
@@ -2919,6 +2555,16 @@ define <32 x i16>@test_int_x86_avx512_mask_psrlv32hi(<32 x i16> %x0, <32 x i16>
; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrlv32hi:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpsrlvw %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -2939,6 +2585,16 @@ define <32 x i16>@test_int_x86_avx512_mask_psra_w_512(<32 x i16> %x0, <8 x i16>
; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psra_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpsraw %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpsraw %xmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpsraw %xmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -2947,9 +2603,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psra_w_512(<32 x i16> %x0, <8 x i16>
ret <32 x i16> %res4
}
-declare <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16>, i8, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16>, i32, <32 x i16>, i32)
-define <32 x i16>@test_int_x86_avx512_mask_psra_wi_512(<32 x i16> %x0, i8 %x1, <32 x i16> %x2, i32 %x3) {
+define <32 x i16>@test_int_x86_avx512_mask_psra_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psra_wi_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
@@ -2959,49 +2615,19 @@ define <32 x i16>@test_int_x86_avx512_mask_psra_wi_512(<32 x i16> %x0, i8 %x1, <
; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
- %res = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 %x3)
- %res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i8 3, <32 x i16> zeroinitializer, i32 %x3)
- %res2 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 -1)
- %res3 = add <32 x i16> %res, %res1
- %res4 = add <32 x i16> %res3, %res2
- ret <32 x i16> %res4
-}
-
-declare <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16>, i8, <32 x i16>, i32)
-
-define <32 x i16>@test_int_x86_avx512_mask_pshufh_w_512(<32 x i16> %x0, i8 %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pshufh_w_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %esi, %k1
-; AVX512BW-NEXT: vpshufhw $3, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpshufhw $3, %zmm0, %zmm2 {%k1} {z}
-; AVX512BW-NEXT: vpshufhw $3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
- %res = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 %x3)
- %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i8 3, <32 x i16> zeroinitializer, i32 %x3)
- %res2 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 -1)
- %res3 = add <32 x i16> %res, %res1
- %res4 = add <32 x i16> %res3, %res2
- ret <32 x i16> %res4
-}
-
-declare <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16>, i8, <32 x i16>, i32)
-
-define <32 x i16>@test_int_x86_avx512_mask_pshufl_w_512(<32 x i16> %x0, i8 %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pshufl_w_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %esi, %k1
-; AVX512BW-NEXT: vpshuflw $3, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpshuflw $3, %zmm0, %zmm2 {%k1} {z}
-; AVX512BW-NEXT: vpshuflw $3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
- %res = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 %x3)
- %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i8 3, <32 x i16> zeroinitializer, i32 %x3)
- %res2 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 -1)
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psra_wi_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpsraw $3, %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vpsraw $3, %zmm0, %zmm2 {%k1} {z}
+; AVX512F-32-NEXT: vpsraw $3, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1)
%res3 = add <32 x i16> %res, %res1
%res4 = add <32 x i16> %res3, %res2
ret <32 x i16> %res4
@@ -3019,6 +2645,16 @@ define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi(<32 x i16> %x0, <32 x i16>
; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrav32_hi:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpsravw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpsravw %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -3027,6 +2663,24 @@ define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi(<32 x i16> %x0, <32 x i16>
ret <32 x i16> %res4
}
+define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi_const(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_psrav32_hi_const:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
+; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrav32_hi_const:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vmovdqu16 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
+; AVX512F-32-NEXT: vpsravw {{\.LCPI.*}}, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> <i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51, i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51, i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51, i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51>,
+ <32 x i16> <i16 1, i16 10, i16 35, i16 52, i16 69, i16 9, i16 16, i16 49, i16 1, i16 10, i16 35, i16 52, i16 69, i16 9, i16 16, i16 49, i16 1, i16 10, i16 35, i16 52, i16 69, i16 9, i16 16, i16 49, i16 1, i16 10, i16 35, i16 52, i16 69, i16 9, i16 16, i16 49>,
+ <32 x i16> zeroinitializer, i32 -1)
+ ret <32 x i16> %res
+}
+
declare <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16>, <8 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_psll_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) {
@@ -3039,6 +2693,16 @@ define <32 x i16>@test_int_x86_avx512_mask_psll_w_512(<32 x i16> %x0, <8 x i16>
; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psll_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpsllw %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpsllw %xmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpsllw %xmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -3047,9 +2711,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psll_w_512(<32 x i16> %x0, <8 x i16>
ret <32 x i16> %res4
}
-declare <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16>, i8, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16>, i32, <32 x i16>, i32)
-define <32 x i16>@test_int_x86_avx512_mask_psll_wi_512(<32 x i16> %x0, i8 %x1, <32 x i16> %x2, i32 %x3) {
+define <32 x i16>@test_int_x86_avx512_mask_psll_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psll_wi_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %esi, %k1
@@ -3059,9 +2723,19 @@ define <32 x i16>@test_int_x86_avx512_mask_psll_wi_512(<32 x i16> %x0, i8 %x1, <
; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
- %res = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 %x3)
- %res1 = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i8 3, <32 x i16> zeroinitializer, i32 %x3)
- %res2 = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i8 3, <32 x i16> %x2, i32 -1)
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psll_wi_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpsllw $3, %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vpsllw $3, %zmm0, %zmm2 {%k1} {z}
+; AVX512F-32-NEXT: vpsllw $3, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1)
%res3 = add <32 x i16> %res, %res1
%res4 = add <32 x i16> %res3, %res2
ret <32 x i16> %res4
@@ -3079,6 +2753,16 @@ define <32 x i16>@test_int_x86_avx512_mask_psllv32hi(<32 x i16> %x0, <32 x i16>
; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_psllv32hi:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpsllvw %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -3092,13 +2776,23 @@ declare <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8>, <32 x i16>, i3
define <32 x i16>@test_int_x86_avx512_mask_pmovzxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovzxb_w_512:
; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpmovzxbw %ymm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpmovzxbw %ymm0, %zmm2 {%k1} {z}
-; AVX512BW-NEXT: vpmovzxbw %ymm0, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovzxb_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpmovzxbw {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512F-32-NEXT: vpmovzxbw {{.*#+}} zmm2 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512F-32-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 -1)
@@ -3107,23 +2801,256 @@ define <32 x i16>@test_int_x86_avx512_mask_pmovzxb_w_512(<32 x i8> %x0, <32 x i1
ret <32 x i16> %res4
}
-
declare <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_pmovsxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovsxb_w_512:
; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm2 {%k1} {z}
-; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm2 {%k1} {z}
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovsxb_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpmovsxbw %ymm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vpmovsxbw %ymm0, %zmm2 {%k1} {z}
+; AVX512F-32-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2)
%res2 = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 -1)
%res3 = add <32 x i16> %res, %res1
%res4 = add <32 x i16> %res3, %res2
ret <32 x i16> %res4
-} \ No newline at end of file
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
+; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res3, %res2
+ ret <32 x i16> %res4
+}
+
+declare i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8>, <64 x i8>, i64)
+
+define i64@test_int_x86_avx512_ptestm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_ptestm_b_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vptestmb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT: kmovq %k0, %rcx
+; AVX512BW-NEXT: vptestmb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: addq %rcx, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_ptestm_b_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: subl $20, %esp
+; AVX512F-32-NEXT: .Ltmp6:
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 24
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
+; AVX512F-32-NEXT: vptestmb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, (%esp)
+; AVX512F-32-NEXT: vptestmb %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: movl (%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl $20, %esp
+; AVX512F-32-NEXT: retl
+ %res = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
+ %res1 = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 -1)
+ %res2 = add i64 %res, %res1
+ ret i64 %res2
+}
+
+declare i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16>, <32 x i16>, i32)
+
+define i32@test_int_x86_avx512_ptestm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_ptestm_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %ecx
+; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: addl %ecx, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_ptestm_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vptestmw %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovd %k0, %ecx
+; AVX512F-32-NEXT: vptestmw %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: addl %ecx, %eax
+; AVX512F-32-NEXT: retl
+ %res = call i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
+ %res1 = call i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 -1)
+ %res2 = add i32 %res, %res1
+ ret i32 %res2
+}
+
+declare i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8>, <64 x i8>, i64 %x2)
+
+define i64@test_int_x86_avx512_ptestnm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_ptestnm_b_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT: kmovq %k0, %rcx
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: addq %rcx, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_ptestnm_b_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: subl $20, %esp
+; AVX512F-32-NEXT: .Ltmp7:
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 24
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
+; AVX512F-32-NEXT: vptestnmb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, (%esp)
+; AVX512F-32-NEXT: vptestnmb %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: movl (%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl $20, %esp
+; AVX512F-32-NEXT: retl
+ %res = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
+ %res1 = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 -1)
+ %res2 = add i64 %res, %res1
+ ret i64 %res2
+}
+
+declare i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16>, <32 x i16>, i32 %x2)
+
+define i32@test_int_x86_avx512_ptestnm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_ptestnm_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %ecx
+; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: addl %ecx, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_ptestnm_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vptestnmw %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovd %k0, %ecx
+; AVX512F-32-NEXT: vptestnmw %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: addl %ecx, %eax
+; AVX512F-32-NEXT: retl
+ %res = call i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
+ %res1 = call i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 -1)
+ %res2 = add i32 %res, %res1
+ ret i32 %res2
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovq %rsi, %k1
+; AVX512BW-NEXT: vpbroadcastb %dil, %zmm0 {%k1}
+; AVX512BW-NEXT: vpbroadcastb %dil, %zmm1 {%k1} {z}
+; AVX512BW-NEXT: vpbroadcastb %dil, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movb {{[0-9]+}}(%esp), %al
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
+; AVX512F-32-NEXT: vpbroadcastb %al, %zmm1 {%k1} {z}
+; AVX512F-32-NEXT: vpbroadcastb %al, %zmm0 {%k1}
+; AVX512F-32-NEXT: vpbroadcastb %al, %zmm2
+; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
+; AVX512F-32-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 %mask)
+ %res2 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> zeroinitializer, i64 %mask)
+ %res3 = add <64 x i8> %res, %res1
+ %res4 = add <64 x i8> %res2, %res3
+ ret <64 x i8> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: vpbroadcastw %di, %zmm0 {%k1}
+; AVX512BW-NEXT: vpbroadcastw %di, %zmm1 {%k1} {z}
+; AVX512BW-NEXT: vpbroadcastw %di, %zmm2
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpbroadcastw %ax, %zmm0 {%k1}
+; AVX512F-32-NEXT: vpbroadcastw %ax, %zmm1 {%k1} {z}
+; AVX512F-32-NEXT: vpbroadcastw %ax, %zmm2
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 %mask)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> zeroinitializer, i32 %mask)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res2, %res3
+ ret <32 x i16> %res4
+}
diff --git a/test/CodeGen/X86/avx512bw-mask-op.ll b/test/CodeGen/X86/avx512bw-mask-op.ll
index 0208011cf89d..619c42494e2d 100644
--- a/test/CodeGen/X86/avx512bw-mask-op.ll
+++ b/test/CodeGen/X86/avx512bw-mask-op.ll
@@ -1,6 +1,13 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
define i32 @mask32(i32 %x) {
+; CHECK-LABEL: mask32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k0
+; CHECK-NEXT: knotd %k0, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
%m0 = bitcast i32 %x to <32 x i1>
%m1 = xor <32 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
@@ -8,14 +15,15 @@ define i32 @mask32(i32 %x) {
i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <32 x i1> %m1 to i32
ret i32 %ret
-; CHECK-LABEL: mask32
-; CHECK: kmovd
-; CHECK-NEXT: knotd
-; CHECK-NEXT: kmovd
-; CHECK_NEXT: ret
}
define i64 @mask64(i64 %x) {
+; CHECK-LABEL: mask64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovq %rdi, %k0
+; CHECK-NEXT: knotq %k0, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
%m0 = bitcast i64 %x to <64 x i1>
%m1 = xor <64 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
@@ -27,14 +35,15 @@ define i64 @mask64(i64 %x) {
i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <64 x i1> %m1 to i64
ret i64 %ret
-; CHECK-LABEL: mask64
-; CHECK: kmovq
-; CHECK-NEXT: knotq
-; CHECK-NEXT: kmovq
-; CHECK_NEXT: ret
}
define void @mask32_mem(i32* %ptr) {
+; CHECK-LABEL: mask32_mem:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd (%rdi), %k0
+; CHECK-NEXT: knotd %k0, %k0
+; CHECK-NEXT: kmovd %k0, (%rdi)
+; CHECK-NEXT: retq
%x = load i32, i32* %ptr, align 4
%m0 = bitcast i32 %x to <32 x i1>
%m1 = xor <32 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
@@ -44,14 +53,15 @@ define void @mask32_mem(i32* %ptr) {
%ret = bitcast <32 x i1> %m1 to i32
store i32 %ret, i32* %ptr, align 4
ret void
-; CHECK-LABEL: mask32_mem
-; CHECK: kmovd ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
-; CHECK-NEXT: knotd
-; CHECK-NEXT: kmovd %k{{[0-7]}}, ([[ARG1]])
-; CHECK_NEXT: ret
}
define void @mask64_mem(i64* %ptr) {
+; CHECK-LABEL: mask64_mem:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovq (%rdi), %k0
+; CHECK-NEXT: knotq %k0, %k0
+; CHECK-NEXT: kmovq %k0, (%rdi)
+; CHECK-NEXT: retq
%x = load i64, i64* %ptr, align 4
%m0 = bitcast i64 %x to <64 x i1>
%m1 = xor <64 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
@@ -65,35 +75,78 @@ define void @mask64_mem(i64* %ptr) {
%ret = bitcast <64 x i1> %m1 to i64
store i64 %ret, i64* %ptr, align 4
ret void
-; CHECK-LABEL: mask64_mem
-; CHECK: kmovq ([[ARG1]]), %k{{[0-7]}}
-; CHECK-NEXT: knotq
-; CHECK-NEXT: kmovq %k{{[0-7]}}, ([[ARG1]])
-; CHECK_NEXT: ret
}
define i32 @mand32(i32 %x, i32 %y) {
+; CHECK-LABEL: mand32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: andl %esi, %eax
+; CHECK-NEXT: xorl %esi, %edi
+; CHECK-NEXT: orl %eax, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
%ma = bitcast i32 %x to <32 x i1>
%mb = bitcast i32 %y to <32 x i1>
%mc = and <32 x i1> %ma, %mb
%md = xor <32 x i1> %ma, %mb
%me = or <32 x i1> %mc, %md
%ret = bitcast <32 x i1> %me to i32
-; CHECK: kandd
-; CHECK: kxord
-; CHECK: kord
+ ret i32 %ret
+}
+
+define i32 @mand32_mem(<32 x i1>* %x, <32 x i1>* %y) {
+; CHECK-LABEL: mand32_mem:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd (%rdi), %k0
+; CHECK-NEXT: kmovd (%rsi), %k1
+; CHECK-NEXT: kandd %k1, %k0, %k2
+; CHECK-NEXT: kxord %k1, %k0, %k0
+; CHECK-NEXT: kord %k0, %k2, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: retq
+ %ma = load <32 x i1>, <32 x i1>* %x
+ %mb = load <32 x i1>, <32 x i1>* %y
+ %mc = and <32 x i1> %ma, %mb
+ %md = xor <32 x i1> %ma, %mb
+ %me = or <32 x i1> %mc, %md
+ %ret = bitcast <32 x i1> %me to i32
ret i32 %ret
}
define i64 @mand64(i64 %x, i64 %y) {
+; CHECK-LABEL: mand64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: andq %rsi, %rax
+; CHECK-NEXT: xorq %rsi, %rdi
+; CHECK-NEXT: orq %rax, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%ma = bitcast i64 %x to <64 x i1>
%mb = bitcast i64 %y to <64 x i1>
%mc = and <64 x i1> %ma, %mb
%md = xor <64 x i1> %ma, %mb
%me = or <64 x i1> %mc, %md
%ret = bitcast <64 x i1> %me to i64
-; CHECK: kandq
-; CHECK: kxorq
-; CHECK: korq
+ ret i64 %ret
+}
+
+define i64 @mand64_mem(<64 x i1>* %x, <64 x i1>* %y) {
+; CHECK-LABEL: mand64_mem:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovq (%rdi), %k0
+; CHECK-NEXT: kmovq (%rsi), %k1
+; CHECK-NEXT: kandq %k1, %k0, %k2
+; CHECK-NEXT: kxorq %k1, %k0, %k0
+; CHECK-NEXT: korq %k0, %k2, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: retq
+ %ma = load <64 x i1>, <64 x i1>* %x
+ %mb = load <64 x i1>, <64 x i1>* %y
+ %mc = and <64 x i1> %ma, %mb
+ %md = xor <64 x i1> %ma, %mb
+ %me = or <64 x i1> %mc, %md
+ %ret = bitcast <64 x i1> %me to i64
ret i64 %ret
}
diff --git a/test/CodeGen/X86/avx512bw-mov.ll b/test/CodeGen/X86/avx512bw-mov.ll
index 519b649ff53a..c58b3cc8c3cd 100644
--- a/test/CodeGen/X86/avx512bw-mov.ll
+++ b/test/CodeGen/X86/avx512bw-mov.ll
@@ -1,27 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s
-; CHECK-LABEL: test1
-; CHECK: vmovdqu8
-; CHECK: ret
define <64 x i8> @test1(i8 * %addr) {
+; CHECK-LABEL: test1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0
+; CHECK-NEXT: retq
%vaddr = bitcast i8* %addr to <64 x i8>*
%res = load <64 x i8>, <64 x i8>* %vaddr, align 1
ret <64 x i8>%res
}
-; CHECK-LABEL: test2
-; CHECK: vmovdqu8
-; CHECK: ret
define void @test2(i8 * %addr, <64 x i8> %data) {
+; CHECK-LABEL: test2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu8 %zmm0, (%rdi)
+; CHECK-NEXT: retq
%vaddr = bitcast i8* %addr to <64 x i8>*
store <64 x i8>%data, <64 x i8>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test3
-; CHECK: vmovdqu8{{.*{%k[1-7]}}}
-; CHECK: ret
define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) {
+; CHECK-LABEL: test3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT: vpcmpneqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpblendmb (%rdi), %zmm0, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp ne <64 x i8> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <64 x i8>*
%r = load <64 x i8>, <64 x i8>* %vaddr, align 1
@@ -29,10 +35,13 @@ define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) {
ret <64 x i8>%res
}
-; CHECK-LABEL: test4
-; CHECK: vmovdqu8{{.*{%k[1-7]} {z}}}
-; CHECK: ret
define <64 x i8> @test4(i8 * %addr, <64 x i8> %mask1) {
+; CHECK-LABEL: test4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: vpcmpneqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
%mask = icmp ne <64 x i8> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <64 x i8>*
%r = load <64 x i8>, <64 x i8>* %vaddr, align 1
@@ -40,28 +49,33 @@ define <64 x i8> @test4(i8 * %addr, <64 x i8> %mask1) {
ret <64 x i8>%res
}
-; CHECK-LABEL: test5
-; CHECK: vmovdqu16
-; CHECK: ret
define <32 x i16> @test5(i8 * %addr) {
+; CHECK-LABEL: test5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0
+; CHECK-NEXT: retq
%vaddr = bitcast i8* %addr to <32 x i16>*
%res = load <32 x i16>, <32 x i16>* %vaddr, align 1
ret <32 x i16>%res
}
-; CHECK-LABEL: test6
-; CHECK: vmovdqu16
-; CHECK: ret
define void @test6(i8 * %addr, <32 x i16> %data) {
+; CHECK-LABEL: test6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu16 %zmm0, (%rdi)
+; CHECK-NEXT: retq
%vaddr = bitcast i8* %addr to <32 x i16>*
store <32 x i16>%data, <32 x i16>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test7
-; CHECK: vmovdqu16{{.*{%k[1-7]}}}
-; CHECK: ret
define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) {
+; CHECK-LABEL: test7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT: vpcmpneqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpblendmw (%rdi), %zmm0, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp ne <32 x i16> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <32 x i16>*
%r = load <32 x i16>, <32 x i16>* %vaddr, align 1
@@ -69,13 +83,136 @@ define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) {
ret <32 x i16>%res
}
-; CHECK-LABEL: test8
-; CHECK: vmovdqu16{{.*{%k[1-7]} {z}}}
-; CHECK: ret
define <32 x i16> @test8(i8 * %addr, <32 x i16> %mask1) {
+; CHECK-LABEL: test8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: vpcmpneqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
%mask = icmp ne <32 x i16> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <32 x i16>*
%r = load <32 x i16>, <32 x i16>* %vaddr, align 1
%res = select <32 x i1> %mask, <32 x i16> %r, <32 x i16> zeroinitializer
ret <32 x i16>%res
}
+
+define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
+; CHECK-LABEL: test_mask_load_16xi8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
+; CHECK-NEXT: vpmovb2m %zmm0, %k0
+; CHECK-NEXT: kshiftlq $48, %k0, %k0
+; CHECK-NEXT: kshiftrq $48, %k0, %k1
+; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.masked.load.v16i8(<16 x i8>* %addr, i32 4, <16 x i1>%mask, <16 x i8> undef)
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.masked.load.v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
+
+define <32 x i8> @test_mask_load_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
+; CHECK-LABEL: test_mask_load_32xi8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
+; CHECK-NEXT: vpmovb2m %zmm0, %k0
+; CHECK-NEXT: kshiftlq $32, %k0, %k0
+; CHECK-NEXT: kshiftrq $32, %k0, %k1
+; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; CHECK-NEXT: retq
+ %res = call <32 x i8> @llvm.masked.load.v32i8(<32 x i8>* %addr, i32 4, <32 x i1>%mask, <32 x i8> zeroinitializer)
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.masked.load.v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>)
+
+define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
+; CHECK-LABEL: test_mask_load_8xi16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
+; CHECK-NEXT: vpmovw2m %zmm0, %k0
+; CHECK-NEXT: kshiftld $24, %k0, %k0
+; CHECK-NEXT: kshiftrd $24, %k0, %k1
+; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %addr, i32 4, <8 x i1>%mask, <8 x i16> undef)
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.masked.load.v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
+
+define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
+; CHECK-LABEL: test_mask_load_16xi16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
+; CHECK-NEXT: vpmovb2m %zmm0, %k0
+; CHECK-NEXT: kshiftld $16, %k0, %k0
+; CHECK-NEXT: kshiftrd $16, %k0, %k1
+; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.masked.load.v16i16(<16 x i16>* %addr, i32 4, <16 x i1>%mask, <16 x i16> zeroinitializer)
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.masked.load.v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>)
+
+define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
+; CHECK-LABEL: test_mask_store_16xi8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
+; CHECK-NEXT: vpmovb2m %zmm0, %k0
+; CHECK-NEXT: kshiftlq $48, %k0, %k0
+; CHECK-NEXT: kshiftrq $48, %k0, %k1
+; CHECK-NEXT: vmovdqu8 %zmm1, (%rdi) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.masked.store.v16i8(<16 x i8> %val, <16 x i8>* %addr, i32 4, <16 x i1>%mask)
+ ret void
+}
+declare void @llvm.masked.store.v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
+
+define void @test_mask_store_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
+; CHECK-LABEL: test_mask_store_32xi8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
+; CHECK-NEXT: vpmovb2m %zmm0, %k0
+; CHECK-NEXT: kshiftlq $32, %k0, %k0
+; CHECK-NEXT: kshiftrq $32, %k0, %k1
+; CHECK-NEXT: vmovdqu8 %zmm1, (%rdi) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.masked.store.v32i8(<32 x i8> %val, <32 x i8>* %addr, i32 4, <32 x i1>%mask)
+ ret void
+}
+declare void @llvm.masked.store.v32i8(<32 x i8>, <32 x i8>*, i32, <32 x i1>)
+
+define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
+; CHECK-LABEL: test_mask_store_8xi16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
+; CHECK-NEXT: vpmovw2m %zmm0, %k0
+; CHECK-NEXT: kshiftld $24, %k0, %k0
+; CHECK-NEXT: kshiftrd $24, %k0, %k1
+; CHECK-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.masked.store.v8i16(<8 x i16> %val, <8 x i16>* %addr, i32 4, <8 x i1>%mask)
+ ret void
+}
+declare void @llvm.masked.store.v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
+
+define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
+; CHECK-LABEL: test_mask_store_16xi16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
+; CHECK-NEXT: vpmovb2m %zmm0, %k0
+; CHECK-NEXT: kshiftld $16, %k0, %k0
+; CHECK-NEXT: kshiftrd $16, %k0, %k1
+; CHECK-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.masked.store.v16i16(<16 x i16> %val, <16 x i16>* %addr, i32 4, <16 x i1>%mask)
+ ret void
+}
+declare void @llvm.masked.store.v16i16(<16 x i16>, <16 x i16>*, i32, <16 x i1>)
diff --git a/test/CodeGen/X86/avx512bw-vec-cmp.ll b/test/CodeGen/X86/avx512bw-vec-cmp.ll
index 141f5cc09219..016837e61307 100644
--- a/test/CodeGen/X86/avx512bw-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512bw-vec-cmp.ll
@@ -1,94 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
-; CHECK-LABEL: test1
-; CHECK: vpcmpeqb {{.*%k[0-7]}}
-; CHECK: vmovdqu8 {{.*}}%k1
-; CHECK: ret
define <64 x i8> @test1(<64 x i8> %x, <64 x i8> %y) nounwind {
+; CHECK-LABEL: test1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp eq <64 x i8> %x, %y
%max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y
ret <64 x i8> %max
}
-; CHECK-LABEL: test2
-; CHECK: vpcmpgtb {{.*%k[0-7]}}
-; CHECK: vmovdqu8 {{.*}}%k1
-; CHECK: ret
define <64 x i8> @test2(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind {
+; CHECK-LABEL: test2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpblendmb %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sgt <64 x i8> %x, %y
%max = select <64 x i1> %mask, <64 x i8> %x1, <64 x i8> %y
ret <64 x i8> %max
}
-; CHECK-LABEL: @test3
-; CHECK: vpcmplew {{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <32 x i16> @test3(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1) nounwind {
+; CHECK-LABEL: test3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k1
+; CHECK-NEXT: vpblendmw %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sge <32 x i16> %x, %y
%max = select <32 x i1> %mask, <32 x i16> %x1, <32 x i16> %y
ret <32 x i16> %max
}
-; CHECK-LABEL: test4
-; CHECK: vpcmpnleub {{.*%k[0-7]}}
-; CHECK: vmovdqu8 {{.*}}%k1
-; CHECK: ret
define <64 x i8> @test4(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind {
+; CHECK-LABEL: test4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpblendmb %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp ugt <64 x i8> %x, %y
%max = select <64 x i1> %mask, <64 x i8> %x1, <64 x i8> %y
ret <64 x i8> %max
}
-; CHECK-LABEL: test5
-; CHECK: vpcmpeqw (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <32 x i16> @test5(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %yp) nounwind {
+; CHECK-LABEL: test5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw (%rdi), %zmm0, %k1
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <32 x i16>, <32 x i16>* %yp, align 4
%mask = icmp eq <32 x i16> %x, %y
%max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1
ret <32 x i16> %max
}
-; CHECK-LABEL: @test6
-; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <32 x i16> @test6(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind {
+; CHECK-LABEL: test6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtw (%rdi), %zmm0, %k1
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
%mask = icmp sgt <32 x i16> %x, %y
%max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1
ret <32 x i16> %max
}
-; CHECK-LABEL: @test7
-; CHECK: vpcmplew (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <32 x i16> @test7(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind {
+; CHECK-LABEL: test7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmplew (%rdi), %zmm0, %k1
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
%mask = icmp sle <32 x i16> %x, %y
%max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1
ret <32 x i16> %max
}
-; CHECK-LABEL: @test8
-; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <32 x i16> @test8(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind {
+; CHECK-LABEL: test8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleuw (%rdi), %zmm0, %k1
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
%mask = icmp ule <32 x i16> %x, %y
%max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1
ret <32 x i16> %max
}
-; CHECK-LABEL: @test9
-; CHECK: vpcmpeqw %zmm{{.*{%k[1-7]}}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <32 x i16> @test9(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1, <32 x i16> %y1) nounwind {
+; CHECK-LABEL: test9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 {%k1}
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp eq <32 x i16> %x1, %y1
%mask0 = icmp eq <32 x i16> %x, %y
%mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer
@@ -96,11 +107,13 @@ define <32 x i16> @test9(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1, <32 x i16
ret <32 x i16> %max
}
-; CHECK-LABEL: @test10
-; CHECK: vpcmpleb %zmm{{.*{%k[1-7]}}}
-; CHECK: vmovdqu8
-; CHECK: ret
define <64 x i8> @test10(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1, <64 x i8> %y1) nounwind {
+; CHECK-LABEL: test10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpcmpleb %zmm2, %zmm3, %k1 {%k1}
+; CHECK-NEXT: vpblendmb %zmm0, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <64 x i8> %x1, %y1
%mask0 = icmp sle <64 x i8> %x, %y
%mask = select <64 x i1> %mask0, <64 x i1> %mask1, <64 x i1> zeroinitializer
@@ -108,11 +121,13 @@ define <64 x i8> @test10(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1, <64 x i8> %y
ret <64 x i8> %max
}
-; CHECK-LABEL: @test11
-; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqu8
-; CHECK: ret
define <64 x i8> @test11(<64 x i8> %x, <64 x i8>* %y.ptr, <64 x i8> %x1, <64 x i8> %y1) nounwind {
+; CHECK-LABEL: test11:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpcmpgtb (%rdi), %zmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sgt <64 x i8> %x1, %y1
%y = load <64 x i8>, <64 x i8>* %y.ptr, align 4
%mask0 = icmp sgt <64 x i8> %x, %y
@@ -121,11 +136,13 @@ define <64 x i8> @test11(<64 x i8> %x, <64 x i8>* %y.ptr, <64 x i8> %x1, <64 x i
ret <64 x i8> %max
}
-; CHECK-LABEL: @test12
-; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <32 x i16> @test12(<32 x i16> %x, <32 x i16>* %y.ptr, <32 x i16> %x1, <32 x i16> %y1) nounwind {
+; CHECK-LABEL: test12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmplew %zmm1, %zmm2, %k1
+; CHECK-NEXT: vpcmpleuw (%rdi), %zmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <32 x i16> %x1, %y1
%y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
%mask0 = icmp ule <32 x i16> %x, %y
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..7cd0da9564ff
--- /dev/null
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll
@@ -0,0 +1,244 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vlbw-builtins.c
+
+define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastb %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastb %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
+ %res1 = bitcast <16 x i8> %res0 to <2 x i64>
+ ret <2 x i64> %res1
+}
+
+define <2 x i64> @test_mm_mask_broadcastb_epi8(<2 x i64> %a0, i16 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm_mask_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastb %xmm1, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastb %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res0 = shufflevector <16 x i8> %arg2, <16 x i8> undef, <16 x i32> zeroinitializer
+ %res1 = select <16 x i1> %arg1, <16 x i8> %res0, <16 x i8> %arg0
+ %res2 = bitcast <16 x i8> %res1 to <2 x i64>
+ ret <2 x i64> %res2
+}
+
+define <2 x i64> @test_mm_maskz_broadcastb_epi8(i16 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_maskz_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res0 = shufflevector <16 x i8> %arg1, <16 x i8> undef, <16 x i32> zeroinitializer
+ %res1 = select <16 x i1> %arg0, <16 x i8> %res0, <16 x i8> zeroinitializer
+ %res2 = bitcast <16 x i8> %res1 to <2 x i64>
+ ret <2 x i64> %res2
+}
+
+define <4 x i64> @test_mm256_broadcastb_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastb %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastb %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <32 x i32> zeroinitializer
+ %res1 = bitcast <32 x i8> %res0 to <4 x i64>
+ ret <4 x i64> %res1
+}
+
+define <4 x i64> @test_mm256_mask_broadcastb_epi8(<4 x i64> %a0, i32 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm256_mask_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpbroadcastb %xmm1, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastb %xmm1, %ymm0 {%k1}
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+ %arg1 = bitcast i32 %a1 to <32 x i1>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res0 = shufflevector <16 x i8> %arg2, <16 x i8> undef, <32 x i32> zeroinitializer
+ %res1 = select <32 x i1> %arg1, <32 x i8> %res0, <32 x i8> %arg0
+ %res2 = bitcast <32 x i8> %res1 to <4 x i64>
+ ret <4 x i64> %res2
+}
+
+define <4 x i64> @test_mm256_maskz_broadcastb_epi8(i32 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_maskz_broadcastb_epi8:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_broadcastb_epi8:
+; X64: # BB#0:
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i32 %a0 to <32 x i1>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res0 = shufflevector <16 x i8> %arg1, <16 x i8> undef, <32 x i32> zeroinitializer
+ %res1 = select <32 x i1> %arg0, <32 x i8> %res0, <32 x i8> zeroinitializer
+ %res2 = bitcast <32 x i8> %res1 to <4 x i64>
+ ret <4 x i64> %res2
+}
+
+define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastw %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
+ %res1 = bitcast <8 x i16> %res0 to <2 x i64>
+ ret <2 x i64> %res1
+}
+
+define <2 x i64> @test_mm_mask_broadcastw_epi16(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm_mask_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastw %xmm1, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastw %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %arg2 = bitcast <2 x i64> %a2 to <8 x i16>
+ %res0 = shufflevector <8 x i16> %arg2, <8 x i16> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg1, <8 x i16> %res0, <8 x i16> %arg0
+ %res2 = bitcast <8 x i16> %res1 to <2 x i64>
+ ret <2 x i64> %res2
+}
+
+define <2 x i64> @test_mm_maskz_broadcastw_epi16(i8 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_maskz_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res0 = shufflevector <8 x i16> %arg1, <8 x i16> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg0, <8 x i16> %res0, <8 x i16> zeroinitializer
+ %res2 = bitcast <8 x i16> %res1 to <2 x i64>
+ ret <2 x i64> %res2
+}
+
+define <4 x i64> @test_mm256_broadcastw_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastw %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastw %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <16 x i32> zeroinitializer
+ %res1 = bitcast <16 x i16> %res0 to <4 x i64>
+ ret <4 x i64> %res1
+}
+
+define <4 x i64> @test_mm256_mask_broadcastw_epi16(<4 x i64> %a0, i16 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm256_mask_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastw %xmm1, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastw %xmm1, %ymm0 {%k1}
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+ %arg1 = bitcast i16 %a1 to <16 x i1>
+ %arg2 = bitcast <2 x i64> %a2 to <8 x i16>
+ %res0 = shufflevector <8 x i16> %arg2, <8 x i16> undef, <16 x i32> zeroinitializer
+ %res1 = select <16 x i1> %arg1, <16 x i16> %res0, <16 x i16> %arg0
+ %res2 = bitcast <16 x i16> %res1 to <4 x i64>
+ ret <4 x i64> %res2
+}
+
+define <4 x i64> @test_mm256_maskz_broadcastw_epi16(i16 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_maskz_broadcastw_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_broadcastw_epi16:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i16 %a0 to <16 x i1>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res0 = shufflevector <8 x i16> %arg1, <8 x i16> undef, <16 x i32> zeroinitializer
+ %res1 = select <16 x i1> %arg0, <16 x i16> %res0, <16 x i16> zeroinitializer
+ %res2 = bitcast <16 x i16> %res1 to <4 x i64>
+ ret <4 x i64> %res2
+}
+
+!0 = !{i32 1}
+
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
new file mode 100644
index 000000000000..9373561ea3ae
--- /dev/null
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -0,0 +1,629 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+
+declare <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8>, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x78,0xd0]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8]
+; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0]
+; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc9]
+; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1)
+ %res1 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask)
+ %res2 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> zeroinitializer, i32 %mask)
+ %res3 = add <32 x i8> %res, %res1
+ %res4 = add <32 x i8> %res2, %res3
+ ret <32 x i8> %res4
+}
+
+declare <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x78,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8]
+; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc9]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask)
+ %res2 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> zeroinitializer, i16 %mask)
+ %res3 = add <16 x i8> %res, %res1
+ %res4 = add <16 x i8> %res2, %res3
+ ret <16 x i8> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x79,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8]
+; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
+; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc9]
+; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1)
+ %res1 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask)
+ %res2 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> zeroinitializer, i16 %mask)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res2, %res3
+ ret <16 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x79,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8]
+; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc9]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask)
+ %res2 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> zeroinitializer, i8 %mask)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res2, %res3
+ ret <8 x i16> %res4
+}
+
+declare <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8>, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0]
+; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8]
+; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0]
+; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9]
+; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1)
+ %res1 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask)
+ %res2 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> zeroinitializer, i64 %mask)
+ %res3 = add <64 x i8> %res, %res1
+ %res4 = add <64 x i8> %res2, %res3
+ ret <64 x i8> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8]
+; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0]
+; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9]
+; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1)
+ %res1 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask)
+ %res2 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> zeroinitializer, i32 %mask)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res2, %res3
+ ret <32 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.storeu.b.128(i8*, <16 x i8>, i16)
+
+define void@test_int_x86_avx512_mask_storeu_b_128(i8* %ptr1, i8* %ptr2, <16 x i8> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x7f,0x07]
+; CHECK-NEXT: vmovdqu8 %xmm0, (%rsi) ## encoding: [0x62,0xf1,0x7f,0x08,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.b.128(i8* %ptr1, <16 x i8> %x1, i16 %x2)
+ call void @llvm.x86.avx512.mask.storeu.b.128(i8* %ptr2, <16 x i8> %x1, i16 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.b.256(i8*, <32 x i8>, i32)
+
+define void@test_int_x86_avx512_mask_storeu_b_256(i8* %ptr1, i8* %ptr2, <32 x i8> %x1, i32 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
+; CHECK-NEXT: vmovdqu8 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x7f,0x07]
+; CHECK-NEXT: vmovdqu8 %ymm0, (%rsi) ## encoding: [0x62,0xf1,0x7f,0x28,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.b.256(i8* %ptr1, <32 x i8> %x1, i32 %x2)
+ call void @llvm.x86.avx512.mask.storeu.b.256(i8* %ptr2, <32 x i8> %x1, i32 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.w.128(i8*, <8 x i16>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_w_128(i8* %ptr1, i8* %ptr2, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x7f,0x07]
+; CHECK-NEXT: vmovdqu16 %xmm0, (%rsi) ## encoding: [0x62,0xf1,0xff,0x08,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.w.128(i8* %ptr1, <8 x i16> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.w.128(i8* %ptr2, <8 x i16> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.w.256(i8*, <16 x i16>, i16)
+
+define void@test_int_x86_avx512_mask_storeu_w_256(i8* %ptr1, i8* %ptr2, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu16 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x7f,0x07]
+; CHECK-NEXT: vmovdqu16 %ymm0, (%rsi) ## encoding: [0x62,0xf1,0xff,0x28,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr1, <16 x i16> %x1, i16 %x2)
+ call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr2, <16 x i16> %x1, i16 -1)
+ ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8*, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_loadu_w_128(i8* %ptr, i8* %ptr2, <8 x i16> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_loadu_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x6f,0x07]
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu16 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x6f,0x06]
+; CHECK-NEXT: vmovdqu16 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x6f,0x0f]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res0 = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr, <8 x i16> %x1, i8 -1)
+ %res = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr2, <8 x i16> %res0, i8 %mask)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr, <8 x i16> zeroinitializer, i8 %mask)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8*, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_loadu_w_256(i8* %ptr, i8* %ptr2, <16 x i16> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_loadu_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xff,0x28,0x6f,0x07]
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu16 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x6f,0x06]
+; CHECK-NEXT: vmovdqu16 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x6f,0x0f]
+; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res0 = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr, <16 x i16> %x1, i16 -1)
+ %res = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr2, <16 x i16> %res0, i16 %mask)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr, <16 x i16> zeroinitializer, i16 %mask)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8*, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_loadu_b_128(i8* %ptr, i8* %ptr2, <16 x i8> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_loadu_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7f,0x08,0x6f,0x07]
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu8 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x6f,0x06]
+; CHECK-NEXT: vmovdqu8 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x0f]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr, <16 x i8> %x1, i16 -1)
+ %res = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr2, <16 x i8> %res0, i16 %mask)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr, <16 x i8> zeroinitializer, i16 %mask)
+ %res2 = add <16 x i8> %res, %res1
+ ret <16 x i8> %res2
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8*, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_loadu_b_256(i8* %ptr, i8* %ptr2, <32 x i8> %x1, i32 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_loadu_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7f,0x28,0x6f,0x07]
+; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
+; CHECK-NEXT: vmovdqu8 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x6f,0x06]
+; CHECK-NEXT: vmovdqu8 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0x0f]
+; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res0 = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr, <32 x i8> %x1, i32 -1)
+ %res = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr2, <32 x i8> %res0, i32 %mask)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr, <32 x i8> zeroinitializer, i32 %mask)
+ %res2 = add <32 x i8> %res, %res1
+ ret <32 x i8> %res2
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8>, <16 x i8>, i32, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_palignr_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x3, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_palignr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf3,0x7d,0x08,0x0f,0xd9,0x02]
+; CHECK-NEXT: ## xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x0f,0xd1,0x02]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1]
+; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x0f,0xc1,0x02]
+; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc0]
+; CHECK-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 %x4)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> zeroinitializer, i16 %x4)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 -1)
+ %res3 = add <16 x i8> %res, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8>, <32 x i8>, i32, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_palignr_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x3, i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_palignr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0x7d,0x28,0x0f,0xd9,0x02]
+; CHECK-NEXT: ## ymm3 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x0f,0xd1,0x02]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17]
+; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x0f,0xc1,0x02]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc0]
+; CHECK-NEXT: vpaddb %ymm3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfc,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 %x4)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> zeroinitializer, i32 %x4)
+ %res2 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 -1)
+ %res3 = add <32 x i8> %res, %res1
+ %res4 = add <32 x i8> %res3, %res2
+ ret <32 x i8> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16>, i32, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pshufh_w_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshufh_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm2 ## encoding: [0x62,0xf1,0x7e,0x08,0x70,0xd0,0x03]
+; CHECK-NEXT: ## xmm2 = xmm0[0,1,2,3,7,4,4,4]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x70,0xc8,0x03]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,1,2,3,7,4,4,4]
+; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x70,0xc0,0x03]
+; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,1,2,3,7,4,4,4]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16>, i32, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pshufh_w_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshufh_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm2 ## encoding: [0x62,0xf1,0x7e,0x28,0x70,0xd0,0x03]
+; CHECK-NEXT: ## ymm2 = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x70,0xc8,0x03]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12]
+; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x70,0xc0,0x03]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16>, i32, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pshufl_w_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshufl_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm2 ## encoding: [0x62,0xf1,0x7f,0x08,0x70,0xd0,0x03]
+; CHECK-NEXT: ## xmm2 = xmm0[3,0,0,0,4,5,6,7]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x70,0xc8,0x03]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[3,0,0,0,4,5,6,7]
+; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x70,0xc0,0x03]
+; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[3,0,0,0,4,5,6,7]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16>, i32, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pshufl_w_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshufl_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm2 ## encoding: [0x62,0xf1,0x7f,0x28,0x70,0xd0,0x03]
+; CHECK-NEXT: ## ymm2 = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x70,0xc8,0x03]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15]
+; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x70,0xc0,0x03]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_pcmpeq_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
+ ret i32 %res
+}
+
+define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
+ ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8>, <32 x i8>, i32)
+
+define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_pcmpeq_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
+ ret i16 %res
+}
+
+define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
+ ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16>, <16 x i16>, i16)
+
+define i32 @test_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_pcmpgt_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
+ ret i32 %res
+}
+
+define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
+ ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8>, <32 x i8>, i32)
+
+define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_pcmpgt_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
+ ret i16 %res
+}
+
+define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
+ ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16>, <16 x i16>, i16)
+
+declare <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_punpckhb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7d,0x08,0x68,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x68,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
+ %res2 = add <16 x i8> %res, %res1
+ ret <16 x i8> %res2
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_punpcklb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7d,0x08,0x60,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x60,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
+ %res2 = add <16 x i8> %res, %res1
+ ret <16 x i8> %res2
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_punpckhb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7d,0x28,0x68,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x68,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
+ %res2 = add <32 x i8> %res, %res1
+ ret <32 x i8> %res2
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_punpcklb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7d,0x28,0x60,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x60,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
+ %res2 = add <32 x i8> %res, %res1
+ ret <32 x i8> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_punpcklw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7d,0x08,0x61,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x61,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_punpckhw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7d,0x08,0x69,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x69,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_punpcklw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7d,0x28,0x61,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x61,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_punpckhw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7d,0x28,0x69,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x69,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
+
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index 4cbb9ba6c56a..534d5c85f008 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -1,124 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
; 256-bit
-define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: test_pcmpeq_b_256
-; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
- %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
- ret i32 %res
-}
-
-define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_b_256
-; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
- %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
- ret i32 %res
-}
-
-declare i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8>, <32 x i8>, i32)
-
-define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: test_pcmpeq_w_256
-; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
- %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
- ret i16 %res
-}
-
-define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_w_256
-; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
- %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
- ret i16 %res
-}
-
-declare i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16>, <16 x i16>, i16)
-
-define i32 @test_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-LABEL: test_pcmpgt_b_256
-; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 ##
- %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
- ret i32 %res
-}
-
-define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_b_256
-; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ##
- %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
- ret i32 %res
-}
-
-declare i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8>, <32 x i8>, i32)
-
-define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) {
-; CHECK-LABEL: test_pcmpgt_w_256
-; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 ##
- %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
- ret i16 %res
-}
-
-define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_w_256
-; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ##
- %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
- ret i16 %res
-}
-
-declare i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16>, <16 x i16>, i16)
-
define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
-; CHECK_LABEL: test_cmp_b_256
-; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
+; CHECK-LABEL: test_cmp_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x00]
+; CHECK-NEXT: kmovd %k0, %r8d ## encoding: [0xc5,0x7b,0x93,0xc0]
+; CHECK-NEXT: vpcmpltb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x01]
+; CHECK-NEXT: kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
+; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02]
+; CHECK-NEXT: kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
+; CHECK-NEXT: vpcmpunordb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x03]
+; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x04]
+; CHECK-NEXT: kmovd %k0, %edi ## encoding: [0xc5,0xfb,0x93,0xf8]
+; CHECK-NEXT: vpcmpnltb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x05]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vpcmpnleb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x06]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: vpcmpordb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x07]
+; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT: vmovd %edi, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc7]
+; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
+; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
+; CHECK-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x03]
+; CHECK-NEXT: vmovd %r8d, %xmm1 ## encoding: [0x62,0xd1,0x7d,0x08,0x6e,0xc8]
+; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
+; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
+; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
%vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
-; CHECK: vpcmpltb %ymm1, %ymm0, %k0 ##
%res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
%vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
-; CHECK: vpcmpleb %ymm1, %ymm0, %k0 ##
%res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
%vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
-; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 ##
%res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
%vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
-; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 ##
%res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
%vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
-; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 ##
%res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
%vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
-; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 ##
%res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
%vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
-; CHECK: vpcmpordb %ymm1, %ymm0, %k0 ##
%res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
%vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
ret <8 x i32> %vec7
}
define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
-; CHECK_LABEL: test_mask_cmp_b_256
-; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_cmp_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x00]
+; CHECK-NEXT: kmovd %k0, %r8d ## encoding: [0xc5,0x7b,0x93,0xc0]
+; CHECK-NEXT: vpcmpltb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x01]
+; CHECK-NEXT: kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
+; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02]
+; CHECK-NEXT: kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
+; CHECK-NEXT: vpcmpunordb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x03]
+; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
+; CHECK-NEXT: kmovd %k0, %edi ## encoding: [0xc5,0xfb,0x93,0xf8]
+; CHECK-NEXT: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x05]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vpcmpnleb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x06]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: vpcmpordb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x07]
+; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT: vmovd %edi, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc7]
+; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
+; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
+; CHECK-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x03]
+; CHECK-NEXT: vmovd %r8d, %xmm1 ## encoding: [0x62,0xd1,0x7d,0x08,0x6e,0xc8]
+; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
+; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
+; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
%vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
-; CHECK: vpcmpltb %ymm1, %ymm0, %k0 {%k1} ##
%res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
%vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
-; CHECK: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ##
%res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
%vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
-; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 {%k1} ##
%res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
%vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
-; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ##
%res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
%vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
-; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} ##
%res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
%vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
-; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 {%k1} ##
%res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
%vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
-; CHECK: vpcmpordb %ymm1, %ymm0, %k0 {%k1} ##
%res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
%vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
ret <8 x i32> %vec7
@@ -127,58 +103,97 @@ define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
-; CHECK_LABEL: test_ucmp_b_256
-; CHECK: vpcmpequb %ymm1, %ymm0, %k0 ##
+; CHECK-LABEL: test_ucmp_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpequb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x00]
+; CHECK-NEXT: kmovd %k0, %r8d ## encoding: [0xc5,0x7b,0x93,0xc0]
+; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01]
+; CHECK-NEXT: kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
+; CHECK-NEXT: vpcmpleub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02]
+; CHECK-NEXT: kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
+; CHECK-NEXT: vpcmpunordub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x03]
+; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT: vpcmpnequb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x04]
+; CHECK-NEXT: kmovd %k0, %edi ## encoding: [0xc5,0xfb,0x93,0xf8]
+; CHECK-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x05]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x06]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: vpcmpordub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x07]
+; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT: vmovd %edi, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc7]
+; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
+; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
+; CHECK-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x03]
+; CHECK-NEXT: vmovd %r8d, %xmm1 ## encoding: [0x62,0xd1,0x7d,0x08,0x6e,0xc8]
+; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
+; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
+; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
%vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
-; CHECK: vpcmpltub %ymm1, %ymm0, %k0 ##
%res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
%vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
-; CHECK: vpcmpleub %ymm1, %ymm0, %k0 ##
%res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
%vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
-; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 ##
%res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
%vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
-; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 ##
%res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
%vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
-; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 ##
%res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
%vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
-; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 ##
%res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
%vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
-; CHECK: vpcmpordub %ymm1, %ymm0, %k0 ##
%res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
%vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
ret <8 x i32> %vec7
}
define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
-; CHECK_LABEL: test_mask_ucmp_b_256
-; CHECK: vpcmpequb %ymm1, %ymm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_ucmp_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpcmpequb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x00]
+; CHECK-NEXT: kmovd %k0, %r8d ## encoding: [0xc5,0x7b,0x93,0xc0]
+; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01]
+; CHECK-NEXT: kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
+; CHECK-NEXT: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02]
+; CHECK-NEXT: kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
+; CHECK-NEXT: vpcmpunordub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x03]
+; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
+; CHECK-NEXT: vpcmpnequb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x04]
+; CHECK-NEXT: kmovd %k0, %edi ## encoding: [0xc5,0xfb,0x93,0xf8]
+; CHECK-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x06]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: vpcmpordub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x07]
+; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT: vmovd %edi, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc7]
+; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
+; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
+; CHECK-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x03]
+; CHECK-NEXT: vmovd %r8d, %xmm1 ## encoding: [0x62,0xd1,0x7d,0x08,0x6e,0xc8]
+; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
+; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
+; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
%vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
-; CHECK: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ##
%res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
%vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
-; CHECK: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ##
%res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
%vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
-; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 {%k1} ##
%res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
%vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
-; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 {%k1} ##
%res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
%vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
-; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ##
%res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
%vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
-; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ##
%res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
%vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
-; CHECK: vpcmpordub %ymm1, %ymm0, %k0 {%k1} ##
%res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
%vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
ret <8 x i32> %vec7
@@ -187,58 +202,95 @@ define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask)
declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
-; CHECK_LABEL: test_cmp_w_256
-; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
+; CHECK-LABEL: test_cmp_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltw %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc9,0x01]
+; CHECK-NEXT: vpcmplew %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordw %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd9,0x03]
+; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltw %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xe9,0x05]
+; CHECK-NEXT: vpcmpnlew %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xf1,0x06]
+; CHECK-NEXT: vpcmpordw %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xf9,0x07]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc1]
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x01]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x03]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x04]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x05]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x06]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
-; CHECK: vpcmpltw %ymm1, %ymm0, %k0 ##
%res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
%vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
-; CHECK: vpcmplew %ymm1, %ymm0, %k0 ##
%res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
%vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
-; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 ##
%res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
%vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
-; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 ##
%res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
%vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
-; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 ##
%res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
%vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
-; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 ##
%res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
%vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
-; CHECK: vpcmpordw %ymm1, %ymm0, %k0 ##
%res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
%vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
ret <8 x i16> %vec7
}
define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
-; CHECK_LABEL: test_mask_cmp_w_256
-; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_cmp_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltw %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xd1,0x01]
+; CHECK-NEXT: vpcmplew %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordw %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x03]
+; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltw %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xf1,0x05]
+; CHECK-NEXT: vpcmpnlew %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xf9,0x06]
+; CHECK-NEXT: vpcmpordw %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc9,0x07]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc1]
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x01]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x02]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x03]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x04]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x05]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x06]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
-; CHECK: vpcmpltw %ymm1, %ymm0, %k0 {%k1} ##
%res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
%vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
-; CHECK: vpcmplew %ymm1, %ymm0, %k0 {%k1} ##
%res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
%vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
-; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 {%k1} ##
%res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
%vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
-; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} ##
%res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
%vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
-; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} ##
%res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
%vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
-; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 {%k1} ##
%res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
%vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
-; CHECK: vpcmpordw %ymm1, %ymm0, %k0 {%k1} ##
%res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
%vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
ret <8 x i16> %vec7
@@ -247,58 +299,95 @@ define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask)
declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
-; CHECK_LABEL: test_ucmp_w_256
-; CHECK: vpcmpequw %ymm1, %ymm0, %k0 ##
+; CHECK-LABEL: test_ucmp_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpequw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc9,0x01]
+; CHECK-NEXT: vpcmpleuw %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xd1,0x02]
+; CHECK-NEXT: vpcmpunorduw %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xd9,0x03]
+; CHECK-NEXT: vpcmpnequw %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltuw %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleuw %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xf1,0x06]
+; CHECK-NEXT: vpcmporduw %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xf9,0x07]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc1]
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x01]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x03]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x04]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x05]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x06]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
-; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 ##
%res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
%vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
-; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 ##
%res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
%vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
-; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 ##
%res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
%vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
-; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 ##
%res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
%vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
-; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 ##
%res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
%vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
-; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 ##
%res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
%vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
-; CHECK: vpcmporduw %ymm1, %ymm0, %k0 ##
%res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
%vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
ret <8 x i16> %vec7
}
define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
-; CHECK_LABEL: test_mask_ucmp_w_256
-; CHECK: vpcmpequw %ymm1, %ymm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_ucmp_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpequw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd1,0x01]
+; CHECK-NEXT: vpcmpleuw %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd9,0x02]
+; CHECK-NEXT: vpcmpunorduw %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xe1,0x03]
+; CHECK-NEXT: vpcmpnequw %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltuw %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleuw %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xf9,0x06]
+; CHECK-NEXT: vpcmporduw %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc9,0x07]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc1]
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x01]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x02]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x03]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x04]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x05]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x06]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
-; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} ##
%res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
%vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
-; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} ##
%res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
%vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
-; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 {%k1} ##
%res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
%vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
-; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 {%k1} ##
%res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
%vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
-; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} ##
%res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
%vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
-; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} ##
%res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
%vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
-; CHECK: vpcmporduw %ymm1, %ymm0, %k0 {%k1} ##
%res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
%vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
ret <8 x i16> %vec7
@@ -309,15 +398,24 @@ declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) n
; 128-bit
define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: test_pcmpeq_b_128
-; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_pcmpeq_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
ret i16 %res
}
define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_b_128
-; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_pcmpeq_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
ret i16 %res
}
@@ -325,15 +423,24 @@ define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
declare i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8>, <16 x i8>, i16)
define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: test_pcmpeq_w_128
-; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_pcmpeq_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
ret i8 %res
}
define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_w_128
-; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_pcmpeq_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
ret i8 %res
}
@@ -341,15 +448,24 @@ define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16>, <8 x i16>, i8)
define i16 @test_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: test_pcmpgt_b_128
-; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_pcmpgt_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x64,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
ret i16 %res
}
define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_b_128
-; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_pcmpgt_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
ret i16 %res
}
@@ -357,15 +473,24 @@ define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
declare i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8>, <16 x i8>, i16)
define i8 @test_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: test_pcmpgt_w_128
-; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_pcmpgt_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x65,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
ret i8 %res
}
define i8 @test_mask_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_w_128
-; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_pcmpgt_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x65,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
ret i8 %res
}
@@ -373,58 +498,95 @@ define i8 @test_mask_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16>, <8 x i16>, i8)
define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK_LABEL: test_cmp_b_128
-; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_cmp_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltb %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc9,0x01]
+; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordb %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd9,0x03]
+; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltb %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleb %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xf1,0x06]
+; CHECK-NEXT: vpcmpordb %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xf9,0x07]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc1]
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x01]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x03]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x04]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x05]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x06]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
-; CHECK: vpcmpltb %xmm1, %xmm0, %k0 ##
%res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
%vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
-; CHECK: vpcmpleb %xmm1, %xmm0, %k0 ##
%res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
%vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
-; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 ##
%res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
%vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
-; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 ##
%res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
%vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
-; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 ##
%res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
%vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
-; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 ##
%res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
%vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
-; CHECK: vpcmpordb %xmm1, %xmm0, %k0 ##
%res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
%vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
ret <8 x i16> %vec7
}
define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
-; CHECK_LABEL: test_mask_cmp_b_128
-; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_cmp_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltb %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xd1,0x01]
+; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordb %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x03]
+; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltb %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleb %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xf9,0x06]
+; CHECK-NEXT: vpcmpordb %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc9,0x07]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc1]
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x01]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x02]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x03]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x04]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x05]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x06]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
-; CHECK: vpcmpltb %xmm1, %xmm0, %k0 {%k1} ##
%res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
%vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
-; CHECK: vpcmpleb %xmm1, %xmm0, %k0 {%k1} ##
%res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
%vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
-; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 {%k1} ##
%res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
%vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
-; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} ##
%res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
%vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
-; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} ##
%res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
%vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
-; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 {%k1} ##
%res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
%vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
-; CHECK: vpcmpordb %xmm1, %xmm0, %k0 {%k1} ##
%res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
%vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
ret <8 x i16> %vec7
@@ -433,58 +595,95 @@ define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK_LABEL: test_ucmp_b_128
-; CHECK: vpcmpequb %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_ucmp_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpequb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc9,0x01]
+; CHECK-NEXT: vpcmpleub %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordub %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xd9,0x03]
+; CHECK-NEXT: vpcmpnequb %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltub %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleub %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xf1,0x06]
+; CHECK-NEXT: vpcmpordub %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xf9,0x07]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc1]
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x01]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x03]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x04]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x05]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x06]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
-; CHECK: vpcmpltub %xmm1, %xmm0, %k0 ##
%res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
%vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
-; CHECK: vpcmpleub %xmm1, %xmm0, %k0 ##
%res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
%vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
-; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 ##
%res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
%vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
-; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 ##
%res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
%vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
-; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 ##
%res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
%vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
-; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 ##
%res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
%vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
-; CHECK: vpcmpordub %xmm1, %xmm0, %k0 ##
%res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
%vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
ret <8 x i16> %vec7
}
define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
-; CHECK_LABEL: test_mask_ucmp_b_128
-; CHECK: vpcmpequb %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_ucmp_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpequb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd1,0x01]
+; CHECK-NEXT: vpcmpleub %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordub %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xe1,0x03]
+; CHECK-NEXT: vpcmpnequb %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltub %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleub %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xf9,0x06]
+; CHECK-NEXT: vpcmpordub %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc9,0x07]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vmovd %ecx, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6e,0xc1]
+; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x01]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x02]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x03]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x04]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x05]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x06]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xc4,0xc0,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
-; CHECK: vpcmpltub %xmm1, %xmm0, %k0 {%k1} ##
%res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
%vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
-; CHECK: vpcmpleub %xmm1, %xmm0, %k0 {%k1} ##
%res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
%vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
-; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 {%k1} ##
%res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
%vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
-; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 {%k1} ##
%res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
%vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
-; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} ##
%res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
%vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
-; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} ##
%res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
%vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
-; CHECK: vpcmpordub %xmm1, %xmm0, %k0 {%k1} ##
%res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
%vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
ret <8 x i16> %vec7
@@ -493,58 +692,95 @@ define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask)
declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK_LABEL: test_cmp_w_128
-; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_cmp_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltw %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc9,0x01]
+; CHECK-NEXT: vpcmplew %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordw %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xd9,0x03]
+; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltw %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xe9,0x05]
+; CHECK-NEXT: vpcmpnlew %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xf1,0x06]
+; CHECK-NEXT: vpcmpordw %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltw %xmm1, %xmm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmplew %xmm1, %xmm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordw %xmm1, %xmm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
-; CHECK_LABEL: test_mask_cmp_w_128
-; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_cmp_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltw %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xd1,0x01]
+; CHECK-NEXT: vpcmplew %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordw %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xe1,0x03]
+; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltw %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xf1,0x05]
+; CHECK-NEXT: vpcmpnlew %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xf9,0x06]
+; CHECK-NEXT: vpcmpordw %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltw %xmm1, %xmm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmplew %xmm1, %xmm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordw %xmm1, %xmm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -553,58 +789,95 @@ define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK_LABEL: test_ucmp_w_128
-; CHECK: vpcmpequw %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_ucmp_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpequw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc9,0x01]
+; CHECK-NEXT: vpcmpleuw %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xd1,0x02]
+; CHECK-NEXT: vpcmpunorduw %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xd9,0x03]
+; CHECK-NEXT: vpcmpnequw %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltuw %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleuw %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xf1,0x06]
+; CHECK-NEXT: vpcmporduw %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmporduw %xmm1, %xmm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
-; CHECK_LABEL: test_mask_ucmp_w_128
-; CHECK: vpcmpequw %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_ucmp_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpequw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xd1,0x01]
+; CHECK-NEXT: vpcmpleuw %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xd9,0x02]
+; CHECK-NEXT: vpcmpunorduw %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xe1,0x03]
+; CHECK-NEXT: vpcmpnequw %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltuw %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleuw %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xf9,0x06]
+; CHECK-NEXT: vpcmporduw %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmporduw %xmm1, %xmm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -615,8 +888,11 @@ declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounw
declare <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd256_ps
- ; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa8,0xc2]
+; CHECK-LABEL: test_mask_vfmadd256_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
ret <8 x float> %res
}
@@ -624,8 +900,11 @@ define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8
declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps
- ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
+; CHECK-LABEL: test_mask_vfmadd128_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
}
@@ -634,7 +913,10 @@ declare <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double>, <4 x doub
define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmadd256_pd:
-; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask)
ret <4 x double> %res
}
@@ -643,7 +925,10 @@ declare <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double>, <2 x doub
define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmadd128_pd:
-; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask)
ret <2 x double> %res
}
@@ -651,13 +936,12 @@ define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2
define <2 x double>@test_int_x86_avx512_mask_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xda]
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -669,13 +953,12 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double>, <2 x dou
define <2 x double>@test_int_x86_avx512_mask3_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmadd231pd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfmadd231pd %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb8,0xd9]
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -687,13 +970,12 @@ declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double>, <2 x dou
define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1} {z}
-; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xa8,0xda]
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -703,13 +985,12 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2
define <4 x double>@test_int_x86_avx512_mask_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xda]
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -721,13 +1002,12 @@ declare <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double>, <4 x dou
define <4 x double>@test_int_x86_avx512_mask3_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmadd231pd %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfmadd231pd %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xb8,0xd9]
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -739,13 +1019,12 @@ declare <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double>, <4 x dou
define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1} {z}
-; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xa9,0xa8,0xda]
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -755,13 +1034,12 @@ define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4
define <4 x float>@test_int_x86_avx512_mask_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xda]
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -773,13 +1051,12 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float>, <4 x float
define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmadd231ps %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfmadd231ps %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb8,0xd9]
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -791,13 +1068,12 @@ declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float>, <4 x float
define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1} {z}
-; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xa8,0xda]
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -807,13 +1083,12 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x
define <8 x float>@test_int_x86_avx512_mask_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa8,0xda]
+; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xa8,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -825,13 +1100,12 @@ declare <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float>, <8 x float
define <8 x float>@test_int_x86_avx512_mask3_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmadd231ps %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfmadd231ps %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xb8,0xd9]
+; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xa8,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -843,13 +1117,12 @@ declare <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float>, <8 x float
define <8 x float>@test_int_x86_avx512_maskz_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1} {z}
-; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0xa8,0xda]
+; CHECK-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xa8,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -862,13 +1135,12 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double>, <2 x dou
define <2 x double>@test_int_x86_avx512_mask3_vfmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmsub231pd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfmsub231pd %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xba,0xd9]
+; CHECK-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xaa,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -881,13 +1153,12 @@ declare <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double>, <4 x dou
define <4 x double>@test_int_x86_avx512_mask3_vfmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmsub231pd %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfmsub231pd %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xba,0xd9]
+; CHECK-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xaa,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -899,13 +1170,12 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float>, <4 x float
define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmsub231ps %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfmsub231ps %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xba,0xd9]
+; CHECK-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xaa,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -917,13 +1187,12 @@ declare <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float>, <8 x float
define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmsub231ps %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfmsub231ps %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xba,0xd9]
+; CHECK-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xaa,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -933,8 +1202,11 @@ define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x
declare <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmadd256_ps
- ; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xac,0xc2]
+; CHECK-LABEL: test_mask_vfnmadd256_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xac,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
ret <8 x float> %res
}
@@ -942,8 +1214,11 @@ define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8
declare <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmadd128_ps
- ; CHECK: vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xac,0xc2]
+; CHECK-LABEL: test_mask_vfnmadd128_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xac,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
}
@@ -951,8 +1226,11 @@ define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4
declare <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmadd256_pd
- ; CHECK: vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xac,0xc2]
+; CHECK-LABEL: test_mask_vfnmadd256_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xac,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
ret <4 x double> %res
}
@@ -960,8 +1238,11 @@ define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1,
declare <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmadd128_pd
- ; CHECK: vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xac,0xc2]
+; CHECK-LABEL: test_mask_vfnmadd128_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xac,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
ret <2 x double> %res
}
@@ -969,8 +1250,11 @@ define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1,
declare <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmsub256_ps
- ; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xae,0xc2]
+; CHECK-LABEL: test_mask_vfnmsub256_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xae,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
ret <8 x float> %res
}
@@ -978,8 +1262,11 @@ define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8
declare <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmsub128_ps
- ; CHECK: vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xae,0xc2]
+; CHECK-LABEL: test_mask_vfnmsub128_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xae,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
}
@@ -987,8 +1274,11 @@ define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4
declare <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmsub256_pd
- ; CHECK: vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xae,0xc2]
+; CHECK-LABEL: test_mask_vfnmsub256_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xae,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
ret <4 x double> %res
}
@@ -996,8 +1286,11 @@ define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1,
declare <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfnmsub128_pd
- ; CHECK: vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xae,0xc2]
+; CHECK-LABEL: test_mask_vfnmsub128_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xae,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
ret <2 x double> %res
}
@@ -1006,13 +1299,12 @@ define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1,
define <2 x double>@test_int_x86_avx512_mask_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xae,0xda]
+; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xae,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1024,13 +1316,12 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double>, <2 x do
define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfnmsub231pd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfnmsub231pd %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbe,0xd9]
+; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xae,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1040,13 +1331,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <
define <4 x double>@test_int_x86_avx512_mask_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xae,0xda]
+; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xae,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1058,13 +1348,12 @@ declare <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double>, <4 x do
define <4 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfnmsub231pd %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfnmsub231pd %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xbe,0xd9]
+; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xae,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1074,13 +1363,12 @@ define <4 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, <
define <4 x float>@test_int_x86_avx512_mask_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xae,0xda]
+; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xae,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1092,13 +1380,12 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float>, <4 x floa
define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfnmsub231ps %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfnmsub231ps %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbe,0xd9]
+; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xae,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1108,13 +1395,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4
define <8 x float>@test_int_x86_avx512_mask_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xae,0xda]
+; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xae,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -1126,13 +1412,12 @@ declare <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float>, <8 x floa
define <8 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfnmsub231ps %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfnmsub231ps %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xbe,0xd9]
+; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xae,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -1142,13 +1427,12 @@ define <8 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8
define <2 x double>@test_int_x86_avx512_mask_vfnmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xac,0xda]
+; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xac,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1158,13 +1442,12 @@ define <2 x double>@test_int_x86_avx512_mask_vfnmadd_pd_128(<2 x double> %x0, <2
define <4 x double>@test_int_x86_avx512_mask_vfnmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xac,0xda]
+; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xac,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1174,13 +1457,12 @@ define <4 x double>@test_int_x86_avx512_mask_vfnmadd_pd_256(<4 x double> %x0, <4
define <4 x float>@test_int_x86_avx512_mask_vfnmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xac,0xda]
+; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xac,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1190,13 +1472,12 @@ define <4 x float>@test_int_x86_avx512_mask_vfnmadd_ps_128(<4 x float> %x0, <4 x
define <8 x float>@test_int_x86_avx512_mask_vfnmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xac,0xda]
+; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xac,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -1207,7 +1488,10 @@ declare <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float>, <8 x flo
define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmaddsub256_ps:
-; CHECK: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa6,0xc2]
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa6,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask)
ret <8 x float> %res
}
@@ -1216,7 +1500,10 @@ declare <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float>, <4 x flo
define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmaddsub128_ps:
-; CHECK: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa6,0xc2]
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa6,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask)
ret <4 x float> %res
}
@@ -1224,8 +1511,11 @@ define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4
declare <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmaddsub256_pd
- ; CHECK: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa6,0xc2]
+; CHECK-LABEL: test_mask_vfmaddsub256_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa6,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
ret <4 x double> %res
}
@@ -1233,8 +1523,11 @@ define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a
declare <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmaddsub128_pd
- ; CHECK: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa6,0xc2]
+; CHECK-LABEL: test_mask_vfmaddsub128_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa6,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
ret <2 x double> %res
}
@@ -1242,13 +1535,12 @@ define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a
define <2 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa6,0xda]
+; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa6,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1260,13 +1552,12 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double>, <2 x
define <2 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmaddsub231pd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfmaddsub231pd %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb6,0xd9]
+; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa6,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1278,13 +1569,12 @@ declare <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double>, <2 x
define <2 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1} {z}
-; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0xa6,0xda]
+; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa6,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1294,13 +1584,12 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0,
define <4 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa6,0xda]
+; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa6,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1312,13 +1601,12 @@ declare <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double>, <4 x
define <4 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmaddsub231pd %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfmaddsub231pd %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xb6,0xd9]
+; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa6,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1330,13 +1618,12 @@ declare <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double>, <4 x
define <4 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1} {z}
-; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xa9,0xa6,0xda]
+; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa6,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1346,13 +1633,12 @@ define <4 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0,
define <4 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa6,0xda]
+; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa6,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1364,13 +1650,12 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float>, <4 x fl
define <4 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmaddsub231ps %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfmaddsub231ps %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb6,0xd9]
+; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa6,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1382,13 +1667,12 @@ declare <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float>, <4 x fl
define <4 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1} {z}
-; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0xa6,0xda]
+; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa6,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1398,13 +1682,12 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, <
define <8 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa6,0xda]
+; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xa6,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -1416,13 +1699,12 @@ declare <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float>, <8 x fl
define <8 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmaddsub231ps %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfmaddsub231ps %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xb6,0xd9]
+; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xa6,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -1434,13 +1716,12 @@ declare <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float>, <8 x fl
define <8 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1} {z}
-; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0xa6,0xda]
+; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xa6,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -1452,13 +1733,12 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double>, <2 x
define <2 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmsubadd231pd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfmsubadd231pd %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb7,0xd9]
+; CHECK-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa7,0xc2]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2=fadd <2 x double> %res, %res1
@@ -1470,13 +1750,12 @@ declare <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double>, <4 x
define <4 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmsubadd231pd %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfmsubadd231pd %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xb7,0xd9]
+; CHECK-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa7,0xc2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2=fadd <4 x double> %res, %res1
@@ -1488,13 +1767,12 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float>, <4 x fl
define <4 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmsubadd231ps %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm2, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xda]
+; CHECK-NEXT: vfmsubadd231ps %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb7,0xd9]
+; CHECK-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa7,0xc2]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2=fadd <4 x float> %res, %res1
@@ -1506,13 +1784,12 @@ declare <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float>, <8 x fl
define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
-; CHECK-NEXT: vfmsubadd231ps %ymm1, %ymm0, %ymm3 {%k1}
-; CHECK-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xda]
+; CHECK-NEXT: vfmsubadd231ps %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xb7,0xd9]
+; CHECK-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0xa7,0xc2]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2=fadd <8 x float> %res, %res1
@@ -1521,54 +1798,72 @@ define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <
define <4 x float> @test_mask_vfmadd128_ps_r(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_r
- ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
+; CHECK-LABEL: test_mask_vfmadd128_ps_r:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
}
define <4 x float> @test_mask_vfmadd128_ps_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_rz
- ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2]
+; CHECK-LABEL: test_mask_vfmadd128_ps_rz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
ret <4 x float> %res
}
define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_rmk
- ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmk:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x float>, <4 x float>* %ptr_a2
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
}
define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_rmka
- ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmka:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x float>, <4 x float>* %ptr_a2, align 8
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
ret <4 x float> %res
}
define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz
- ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x float>, <4 x float>* %ptr_a2
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
ret <4 x float> %res
}
define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza
- ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x float>, <4 x float>* %ptr_a2, align 4
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
ret <4 x float> %res
}
define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_rmb
- ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmb:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_a2
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
@@ -1579,8 +1874,11 @@ define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1,
}
define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_rmba
- ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmba:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_a2, align 4
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
@@ -1591,8 +1889,10 @@ define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1
}
define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz
- ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_a2
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
@@ -1603,8 +1903,10 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1
}
define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
- ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza
- ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_a2, align 4
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
@@ -1615,104 +1917,142 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a
}
define <2 x double> @test_mask_vfmadd128_pd_r(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd128_pd_r
- ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
+; CHECK-LABEL: test_mask_vfmadd128_pd_r:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
ret <2 x double> %res
}
define <2 x double> @test_mask_vfmadd128_pd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK-LABEL: test_mask_vfmadd128_pd_rz
- ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2]
+; CHECK-LABEL: test_mask_vfmadd128_pd_rz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
ret <2 x double> %res
}
define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd128_pd_rmk
- ; CHECK: vfmadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_pd_rmk:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vfmadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <2 x double>, <2 x double>* %ptr_a2
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
ret <2 x double> %res
}
define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2) {
- ; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz
- ; CHECK: vfmadd213pd (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213pd (%rdi), %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <2 x double>, <2 x double>* %ptr_a2
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
ret <2 x double> %res
}
define <4 x double> @test_mask_vfmadd256_pd_r(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd256_pd_r
- ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
+; CHECK-LABEL: test_mask_vfmadd256_pd_r:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
ret <4 x double> %res
}
define <4 x double> @test_mask_vfmadd256_pd_rz(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
- ; CHECK-LABEL: test_mask_vfmadd256_pd_rz
- ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2]
+; CHECK-LABEL: test_mask_vfmadd256_pd_rz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
ret <4 x double> %res
}
define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2, i8 %mask) {
- ; CHECK-LABEL: test_mask_vfmadd256_pd_rmk
- ; CHECK: vfmadd213pd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd256_pd_rmk:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vfmadd213pd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x double>, <4 x double>* %ptr_a2
%res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
ret <4 x double> %res
}
define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2) {
- ; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz
- ; CHECK: vfmadd213pd (%rdi), %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0x07]
+; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213pd (%rdi), %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x double>, <4 x double>* %ptr_a2
%res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
ret <4 x double> %res
}
define <8 x i16> @test_mask_add_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
- ;CHECK-LABEL: test_mask_add_epi16_rr_128
- ;CHECK: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0xc1]
+; CHECK-LABEL: test_mask_add_epi16_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_add_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rrk_128
- ;CHECK: vpaddw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0xd1]
+; CHECK-LABEL: test_mask_add_epi16_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_add_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rrkz_128
- ;CHECK: vpaddw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0xc1]
+; CHECK-LABEL: test_mask_add_epi16_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_add_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_add_epi16_rm_128
- ;CHECK: vpaddw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0x07]
+; CHECK-LABEL: test_mask_add_epi16_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_add_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rmk_128
- ;CHECK: vpaddw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0x0f]
+; CHECK-LABEL: test_mask_add_epi16_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_add_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rmkz_128
- ;CHECK: vpaddw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0x07]
+; CHECK-LABEL: test_mask_add_epi16_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
@@ -1721,45 +2061,63 @@ define <8 x i16> @test_mask_add_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b,
declare <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <16 x i16> @test_mask_add_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
- ;CHECK-LABEL: test_mask_add_epi16_rr_256
- ;CHECK: vpaddw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc1]
+; CHECK-LABEL: test_mask_add_epi16_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_add_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rrk_256
- ;CHECK: vpaddw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0xd1]
+; CHECK-LABEL: test_mask_add_epi16_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_add_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rrkz_256
- ;CHECK: vpaddw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0xc1]
+; CHECK-LABEL: test_mask_add_epi16_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_add_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_add_epi16_rm_256
- ;CHECK: vpaddw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0x07]
+; CHECK-LABEL: test_mask_add_epi16_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_add_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rmk_256
- ;CHECK: vpaddw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0x0f]
+; CHECK-LABEL: test_mask_add_epi16_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_add_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rmkz_256
- ;CHECK: vpaddw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0x07]
+; CHECK-LABEL: test_mask_add_epi16_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
@@ -1768,45 +2126,63 @@ define <16 x i16> @test_mask_add_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_
declare <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <8 x i16> @test_mask_sub_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
- ;CHECK-LABEL: test_mask_sub_epi16_rr_128
- ;CHECK: vpsubw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xf9,0xc1]
+; CHECK-LABEL: test_mask_sub_epi16_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xf9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_sub_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rrk_128
- ;CHECK: vpsubw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf9,0xd1]
+; CHECK-LABEL: test_mask_sub_epi16_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf9,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_sub_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rrkz_128
- ;CHECK: vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0xc1]
+; CHECK-LABEL: test_mask_sub_epi16_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_sub_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_sub_epi16_rm_128
- ;CHECK: vpsubw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xf9,0x07]
+; CHECK-LABEL: test_mask_sub_epi16_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xf9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_sub_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rmk_128
- ;CHECK: vpsubw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf9,0x0f]
+; CHECK-LABEL: test_mask_sub_epi16_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf9,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_sub_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rmkz_128
- ;CHECK: vpsubw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0x07]
+; CHECK-LABEL: test_mask_sub_epi16_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
@@ -1815,45 +2191,63 @@ define <8 x i16> @test_mask_sub_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b,
declare <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <16 x i16> @test_mask_sub_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
- ;CHECK-LABEL: test_mask_sub_epi16_rr_256
- ;CHECK: vpsubw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xf9,0xc1]
+; CHECK-LABEL: test_mask_sub_epi16_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xf9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_sub_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rrk_256
- ;CHECK: vpsubw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0xd1]
+; CHECK-LABEL: test_mask_sub_epi16_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_sub_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rrkz_256
- ;CHECK: vpsubw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0xc1]
+; CHECK-LABEL: test_mask_sub_epi16_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_sub_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_sub_epi16_rm_256
- ;CHECK: vpsubw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xf9,0x07]
+; CHECK-LABEL: test_mask_sub_epi16_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xf9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_sub_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rmk_256
- ;CHECK: vpsubw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0x0f]
+; CHECK-LABEL: test_mask_sub_epi16_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_sub_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rmkz_256
- ;CHECK: vpsubw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0x07]
+; CHECK-LABEL: test_mask_sub_epi16_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
@@ -1862,45 +2256,63 @@ define <16 x i16> @test_mask_sub_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_
declare <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <32 x i16> @test_mask_add_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
- ;CHECK-LABEL: test_mask_add_epi16_rr_512
- ;CHECK: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
+; CHECK-LABEL: test_mask_add_epi16_rr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_add_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rrk_512
- ;CHECK: vpaddw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0xd1]
+; CHECK-LABEL: test_mask_add_epi16_rrk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0xd1]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_add_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rrkz_512
- ;CHECK: vpaddw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfd,0xc1]
+; CHECK-LABEL: test_mask_add_epi16_rrkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_add_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_add_epi16_rm_512
- ;CHECK: vpaddw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0x07]
+; CHECK-LABEL: test_mask_add_epi16_rm_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_add_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rmk_512
- ;CHECK: vpaddw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0x0f]
+; CHECK-LABEL: test_mask_add_epi16_rmk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_add_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
- ;CHECK-LABEL: test_mask_add_epi16_rmkz_512
- ;CHECK: vpaddw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfd,0x07]
+; CHECK-LABEL: test_mask_add_epi16_rmkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
@@ -1909,45 +2321,63 @@ define <32 x i16> @test_mask_add_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_
declare <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16> @test_mask_sub_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
- ;CHECK-LABEL: test_mask_sub_epi16_rr_512
- ;CHECK: vpsubw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xf9,0xc1]
+; CHECK-LABEL: test_mask_sub_epi16_rr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xf9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_sub_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rrk_512
- ;CHECK: vpsubw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0xd1]
+; CHECK-LABEL: test_mask_sub_epi16_rrk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0xd1]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_sub_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rrkz_512
- ;CHECK: vpsubw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf9,0xc1]
+; CHECK-LABEL: test_mask_sub_epi16_rrkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_sub_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_sub_epi16_rm_512
- ;CHECK: vpsubw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xf9,0x07]
+; CHECK-LABEL: test_mask_sub_epi16_rm_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xf9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_sub_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rmk_512
- ;CHECK: vpsubw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0x0f]
+; CHECK-LABEL: test_mask_sub_epi16_rmk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_sub_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi16_rmkz_512
- ;CHECK: vpsubw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf9,0x07]
+; CHECK-LABEL: test_mask_sub_epi16_rmkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
@@ -1956,45 +2386,63 @@ define <32 x i16> @test_mask_sub_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_
declare <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16> @test_mask_mullo_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rr_512
- ;CHECK: vpmullw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd5,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi16_rr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd5,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_mullo_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rrk_512
- ;CHECK: vpmullw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0xd1]
+; CHECK-LABEL: test_mask_mullo_epi16_rrk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpmullw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0xd1]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_mullo_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rrkz_512
- ;CHECK: vpmullw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd5,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi16_rrkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpmullw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd5,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_mullo_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rm_512
- ;CHECK: vpmullw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd5,0x07]
+; CHECK-LABEL: test_mask_mullo_epi16_rm_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd5,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_mullo_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rmk_512
- ;CHECK: vpmullw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0x0f]
+; CHECK-LABEL: test_mask_mullo_epi16_rmk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpmullw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
ret <32 x i16> %res
}
define <32 x i16> @test_mask_mullo_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rmkz_512
- ;CHECK: vpmullw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd5,0x07]
+; CHECK-LABEL: test_mask_mullo_epi16_rmkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpmullw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd5,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
%res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %res
@@ -2003,45 +2451,63 @@ define <32 x i16> @test_mask_mullo_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %pt
declare <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <8 x i16> @test_mask_mullo_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rr_128
- ;CHECK: vpmullw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd5,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi16_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd5,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_mullo_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rrk_128
- ;CHECK: vpmullw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd5,0xd1]
+; CHECK-LABEL: test_mask_mullo_epi16_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmullw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd5,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_mullo_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rrkz_128
- ;CHECK: vpmullw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi16_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmullw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_mullo_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rm_128
- ;CHECK: vpmullw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd5,0x07]
+; CHECK-LABEL: test_mask_mullo_epi16_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd5,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_mullo_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rmk_128
- ;CHECK: vpmullw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd5,0x0f]
+; CHECK-LABEL: test_mask_mullo_epi16_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmullw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd5,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_mullo_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rmkz_128
- ;CHECK: vpmullw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0x07]
+; CHECK-LABEL: test_mask_mullo_epi16_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmullw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
@@ -2050,45 +2516,63 @@ define <8 x i16> @test_mask_mullo_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b
declare <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <16 x i16> @test_mask_mullo_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rr_256
- ;CHECK: vpmullw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd5,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi16_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd5,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_mullo_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rrk_256
- ;CHECK: vpmullw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0xd1]
+; CHECK-LABEL: test_mask_mullo_epi16_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_mullo_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rrkz_256
- ;CHECK: vpmullw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi16_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_mullo_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rm_256
- ;CHECK: vpmullw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd5,0x07]
+; CHECK-LABEL: test_mask_mullo_epi16_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd5,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_mullo_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rmk_256
- ;CHECK: vpmullw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0x0f]
+; CHECK-LABEL: test_mask_mullo_epi16_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmullw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_mullo_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi16_rmkz_256
- ;CHECK: vpmullw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0x07]
+; CHECK-LABEL: test_mask_mullo_epi16_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmullw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
@@ -2098,53 +2582,73 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16>, <16 x i16>, <16
define <8 x i16> @test_mask_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
- ;CHECK-LABEL: test_mask_packs_epi32_rr_128
- ;CHECK: vpackssdw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x6b,0xc1]
+; CHECK-LABEL: test_mask_packs_epi32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6b,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rrk_128
- ;CHECK: vpackssdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0xd1]
+; CHECK-LABEL: test_mask_packs_epi32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packs_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rrkz_128
- ;CHECK: vpackssdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0xc1]
+; CHECK-LABEL: test_mask_packs_epi32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_packs_epi32_rm_128
- ;CHECK: vpackssdw (%rdi), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x6b,0x07]
+; CHECK-LABEL: test_mask_packs_epi32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmk_128
- ;CHECK: vpackssdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x0f]
+; CHECK-LABEL: test_mask_packs_epi32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packs_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmkz_128
- ;CHECK: vpackssdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x07]
+; CHECK-LABEL: test_mask_packs_epi32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmb_128
- ;CHECK: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0x6b,0x07]
+; CHECK-LABEL: test_mask_packs_epi32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0x6b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -2153,8 +2657,12 @@ define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
}
define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmbk_128
- ;CHECK: vpackssdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x0f]
+; CHECK-LABEL: test_mask_packs_epi32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -2163,8 +2671,11 @@ define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x
}
define <8 x i16> @test_mask_packs_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmbkz_128
- ;CHECK: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x07]
+; CHECK-LABEL: test_mask_packs_epi32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -2175,53 +2686,73 @@ define <8 x i16> @test_mask_packs_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8
declare <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32>, <4 x i32>, <8 x i16>, i8)
define <16 x i16> @test_mask_packs_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
- ;CHECK-LABEL: test_mask_packs_epi32_rr_256
- ;CHECK: vpackssdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0xc1]
+; CHECK-LABEL: test_mask_packs_epi32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x6b,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rrk_256
- ;CHECK: vpackssdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0xd1]
+; CHECK-LABEL: test_mask_packs_epi32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packs_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rrkz_256
- ;CHECK: vpackssdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0xc1]
+; CHECK-LABEL: test_mask_packs_epi32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_packs_epi32_rm_256
- ;CHECK: vpackssdw (%rdi), %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0x07]
+; CHECK-LABEL: test_mask_packs_epi32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x6b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packs_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmk_256
- ;CHECK: vpackssdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0x0f]
+; CHECK-LABEL: test_mask_packs_epi32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packs_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmkz_256
- ;CHECK: vpackssdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0x07]
+; CHECK-LABEL: test_mask_packs_epi32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmb_256
- ;CHECK: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0x6b,0x07]
+; CHECK-LABEL: test_mask_packs_epi32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0x6b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -2230,8 +2761,12 @@ define <16 x i16> @test_mask_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
}
define <16 x i16> @test_mask_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmbk_256
- ;CHECK: vpackssdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0x6b,0x0f]
+; CHECK-LABEL: test_mask_packs_epi32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0x6b,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -2240,8 +2775,11 @@ define <16 x i16> @test_mask_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16
}
define <16 x i16> @test_mask_packs_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi32_rmbkz_256
- ;CHECK: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0x6b,0x07]
+; CHECK-LABEL: test_mask_packs_epi32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0x6b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -2252,45 +2790,63 @@ define <16 x i16> @test_mask_packs_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i1
declare <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32>, <8 x i32>, <16 x i16>, i16)
define <16 x i8> @test_mask_packs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
- ;CHECK-LABEL: test_mask_packs_epi16_rr_128
- ;CHECK: vpacksswb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc1]
+; CHECK-LABEL: test_mask_packs_epi16_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x63,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi16_rrk_128
- ;CHECK: vpacksswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x63,0xd1]
+; CHECK-LABEL: test_mask_packs_epi16_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi16_rrkz_128
- ;CHECK: vpacksswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x63,0xc1]
+; CHECK-LABEL: test_mask_packs_epi16_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_packs_epi16_rm_128
- ;CHECK: vpacksswb (%rdi), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0x07]
+; CHECK-LABEL: test_mask_packs_epi16_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x63,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi16_rmk_128
- ;CHECK: vpacksswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x63,0x0f]
+; CHECK-LABEL: test_mask_packs_epi16_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi16_rmkz_128
- ;CHECK: vpacksswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x63,0x07]
+; CHECK-LABEL: test_mask_packs_epi16_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
@@ -2299,45 +2855,63 @@ define <16 x i8> @test_mask_packs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b
declare <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16>, <8 x i16>, <16 x i8>, i16)
define <32 x i8> @test_mask_packs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
- ;CHECK-LABEL: test_mask_packs_epi16_rr_256
- ;CHECK: vpacksswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0xc1]
+; CHECK-LABEL: test_mask_packs_epi16_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x63,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi16_rrk_256
- ;CHECK: vpacksswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x63,0xd1]
+; CHECK-LABEL: test_mask_packs_epi16_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi16_rrkz_256
- ;CHECK: vpacksswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x63,0xc1]
+; CHECK-LABEL: test_mask_packs_epi16_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_packs_epi16_rm_256
- ;CHECK: vpacksswb (%rdi), %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0x07]
+; CHECK-LABEL: test_mask_packs_epi16_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x63,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi16_rmk_256
- ;CHECK: vpacksswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x63,0x0f]
+; CHECK-LABEL: test_mask_packs_epi16_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
- ;CHECK-LABEL: test_mask_packs_epi16_rmkz_256
- ;CHECK: vpacksswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x63,0x07]
+; CHECK-LABEL: test_mask_packs_epi16_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
@@ -2347,53 +2921,73 @@ declare <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16>, <16 x i16>, <32
define <8 x i16> @test_mask_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
- ;CHECK-LABEL: test_mask_packus_epi32_rr_128
- ;CHECK: vpackusdw %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_packus_epi32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x2b,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rrk_128
- ;CHECK: vpackusdw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_packus_epi32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packus_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rrkz_128
- ;CHECK: vpackusdw %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_packus_epi32_rm_128
- ;CHECK: vpackusdw (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_packus_epi32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x2b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmk_128
- ;CHECK: vpackusdw (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_packus_epi32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packus_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmkz_128
- ;CHECK: vpackusdw (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmb_128
- ;CHECK: vpackusdw (%rdi){1to4}, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_packus_epi32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x2b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -2402,8 +2996,12 @@ define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
}
define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmbk_128
- ;CHECK: vpackusdw (%rdi){1to4}, %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_packus_epi32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0x2b,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -2412,8 +3010,11 @@ define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8
}
define <8 x i16> @test_mask_packus_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmbkz_128
- ;CHECK: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x99,0x2b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -2424,53 +3025,73 @@ define <8 x i16> @test_mask_packus_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8
declare <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32>, <4 x i32>, <8 x i16>, i8)
define <16 x i16> @test_mask_packus_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
- ;CHECK-LABEL: test_mask_packus_epi32_rr_256
- ;CHECK: vpackusdw %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_packus_epi32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x2b,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rrk_256
- ;CHECK: vpackusdw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_packus_epi32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packus_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rrkz_256
- ;CHECK: vpackusdw %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_packus_epi32_rm_256
- ;CHECK: vpackusdw (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_packus_epi32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x2b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packus_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmk_256
- ;CHECK: vpackusdw (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_packus_epi32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packus_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmkz_256
- ;CHECK: vpackusdw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmb_256
- ;CHECK: vpackusdw (%rdi){1to8}, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_packus_epi32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x38,0x2b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -2479,8 +3100,12 @@ define <16 x i16> @test_mask_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
}
define <16 x i16> @test_mask_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmbk_256
- ;CHECK: vpackusdw (%rdi){1to8}, %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_packus_epi32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x39,0x2b,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -2489,8 +3114,11 @@ define <16 x i16> @test_mask_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <1
}
define <16 x i16> @test_mask_packus_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi32_rmbkz_256
- ;CHECK: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xb9,0x2b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -2501,45 +3129,63 @@ define <16 x i16> @test_mask_packus_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i
declare <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32>, <8 x i32>, <16 x i16>, i16)
define <16 x i8> @test_mask_packus_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
- ;CHECK-LABEL: test_mask_packus_epi16_rr_128
- ;CHECK: vpackuswb %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_packus_epi16_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x67,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi16_rrk_128
- ;CHECK: vpackuswb %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_packus_epi16_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packus_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi16_rrkz_128
- ;CHECK: vpackuswb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi16_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_packus_epi16_rm_128
- ;CHECK: vpackuswb (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_packus_epi16_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x67,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packus_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi16_rmk_128
- ;CHECK: vpackuswb (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_packus_epi16_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_packus_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi16_rmkz_128
- ;CHECK: vpackuswb (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi16_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
@@ -2548,45 +3194,63 @@ define <16 x i8> @test_mask_packus_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_
declare <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16>, <8 x i16>, <16 x i8>, i16)
define <32 x i8> @test_mask_packus_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
- ;CHECK-LABEL: test_mask_packus_epi16_rr_256
- ;CHECK: vpackuswb %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_packus_epi16_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x67,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi16_rrk_256
- ;CHECK: vpackuswb %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_packus_epi16_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packus_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi16_rrkz_256
- ;CHECK: vpackuswb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi16_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_packus_epi16_rm_256
- ;CHECK: vpackuswb (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_packus_epi16_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x67,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packus_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi16_rmk_256
- ;CHECK: vpackuswb (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_packus_epi16_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
- ;CHECK-LABEL: test_mask_packus_epi16_rmkz_256
- ;CHECK: vpackuswb (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_packus_epi16_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
@@ -2595,45 +3259,63 @@ define <32 x i8> @test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %pt
declare <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16>, <16 x i16>, <32 x i8>, i32)
define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
- ;CHECK-LABEL: test_mask_adds_epi16_rr_128
- ;CHECK: vpaddsw %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_adds_epi16_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xed,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi16_rrk_128
- ;CHECK: vpaddsw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_adds_epi16_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi16_rrkz_128
- ;CHECK: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epi16_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_adds_epi16_rm_128
- ;CHECK: vpaddsw (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_adds_epi16_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xed,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi16_rmk_128
- ;CHECK: vpaddsw (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_adds_epi16_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi16_rmkz_128
- ;CHECK: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epi16_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
@@ -2642,45 +3324,63 @@ define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b,
declare <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
- ;CHECK-LABEL: test_mask_adds_epi16_rr_256
- ;CHECK: vpaddsw %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_adds_epi16_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xed,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi16_rrk_256
- ;CHECK: vpaddsw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_adds_epi16_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi16_rrkz_256
- ;CHECK: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epi16_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_adds_epi16_rm_256
- ;CHECK: vpaddsw (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_adds_epi16_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xed,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi16_rmk_256
- ;CHECK: vpaddsw (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_adds_epi16_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi16_rmkz_256
- ;CHECK: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epi16_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
@@ -2689,45 +3389,63 @@ define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr
declare <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
- ;CHECK-LABEL: test_mask_subs_epi16_rr_128
- ;CHECK: vpsubsw %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_subs_epi16_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi16_rrk_128
- ;CHECK: vpsubsw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_subs_epi16_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi16_rrkz_128
- ;CHECK: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epi16_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_subs_epi16_rm_128
- ;CHECK: vpsubsw (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_subs_epi16_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi16_rmk_128
- ;CHECK: vpsubsw (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_subs_epi16_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi16_rmkz_128
- ;CHECK: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epi16_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
@@ -2736,45 +3454,63 @@ define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b,
declare <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
- ;CHECK-LABEL: test_mask_subs_epi16_rr_256
- ;CHECK: vpsubsw %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_subs_epi16_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi16_rrk_256
- ;CHECK: vpsubsw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_subs_epi16_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi16_rrkz_256
- ;CHECK: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epi16_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_subs_epi16_rm_256
- ;CHECK: vpsubsw (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_subs_epi16_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi16_rmk_256
- ;CHECK: vpsubsw (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_subs_epi16_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi16_rmkz_256
- ;CHECK: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epi16_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
@@ -2783,45 +3519,63 @@ define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr
declare <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
- ;CHECK-LABEL: test_mask_adds_epu16_rr_128
- ;CHECK: vpaddusw %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_adds_epu16_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu16_rrk_128
- ;CHECK: vpaddusw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_adds_epu16_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu16_rrkz_128
- ;CHECK: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epu16_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_adds_epu16_rm_128
- ;CHECK: vpaddusw (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_adds_epu16_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu16_rmk_128
- ;CHECK: vpaddusw (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_adds_epu16_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu16_rmkz_128
- ;CHECK: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epu16_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
@@ -2830,45 +3584,63 @@ define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b,
declare <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
- ;CHECK-LABEL: test_mask_adds_epu16_rr_256
- ;CHECK: vpaddusw %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_adds_epu16_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu16_rrk_256
- ;CHECK: vpaddusw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_adds_epu16_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu16_rrkz_256
- ;CHECK: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epu16_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_adds_epu16_rm_256
- ;CHECK: vpaddusw (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_adds_epu16_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu16_rmk_256
- ;CHECK: vpaddusw (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_adds_epu16_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu16_rmkz_256
- ;CHECK: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epu16_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
@@ -2877,45 +3649,63 @@ define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr
declare <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
- ;CHECK-LABEL: test_mask_subs_epu16_rr_128
- ;CHECK: vpsubusw %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_subs_epu16_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu16_rrk_128
- ;CHECK: vpsubusw %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_subs_epu16_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu16_rrkz_128
- ;CHECK: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epu16_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_subs_epu16_rm_128
- ;CHECK: vpsubusw (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_subs_epu16_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu16_rmk_128
- ;CHECK: vpsubusw (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_subs_epu16_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
ret <8 x i16> %res
}
define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu16_rmkz_128
- ;CHECK: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epu16_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
%res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
ret <8 x i16> %res
@@ -2924,45 +3714,63 @@ define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b,
declare <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
- ;CHECK-LABEL: test_mask_subs_epu16_rr_256
- ;CHECK: vpsubusw %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_subs_epu16_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu16_rrk_256
- ;CHECK: vpsubusw %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_subs_epu16_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu16_rrkz_256
- ;CHECK: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epu16_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
- ;CHECK-LABEL: test_mask_subs_epu16_rm_256
- ;CHECK: vpsubusw (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_subs_epu16_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu16_rmk_256
- ;CHECK: vpsubusw (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_subs_epu16_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
ret <16 x i16> %res
}
define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu16_rmkz_256
- ;CHECK: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epu16_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
%res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
ret <16 x i16> %res
@@ -2971,45 +3779,63 @@ define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr
declare <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
- ;CHECK-LABEL: test_mask_adds_epi8_rr_128
- ;CHECK: vpaddsb %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_adds_epi8_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xec,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi8_rrk_128
- ;CHECK: vpaddsb %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_adds_epi8_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi8_rrkz_128
- ;CHECK: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epi8_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
- ;CHECK-LABEL: test_mask_adds_epi8_rm_128
- ;CHECK: vpaddsb (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_adds_epi8_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xec,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi8_rmk_128
- ;CHECK: vpaddsb (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_adds_epi8_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi8_rmkz_128
- ;CHECK: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epi8_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
@@ -3018,45 +3844,63 @@ define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b,
declare <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
- ;CHECK-LABEL: test_mask_adds_epi8_rr_256
- ;CHECK: vpaddsb %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_adds_epi8_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xec,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi8_rrk_256
- ;CHECK: vpaddsb %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_adds_epi8_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi8_rrkz_256
- ;CHECK: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epi8_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
- ;CHECK-LABEL: test_mask_adds_epi8_rm_256
- ;CHECK: vpaddsb (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_adds_epi8_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xec,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi8_rmk_256
- ;CHECK: vpaddsb (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_adds_epi8_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
- ;CHECK-LABEL: test_mask_adds_epi8_rmkz_256
- ;CHECK: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epi8_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
@@ -3065,45 +3909,63 @@ define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b,
declare <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
- ;CHECK-LABEL: test_mask_subs_epi8_rr_128
- ;CHECK: vpsubsb %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_subs_epi8_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi8_rrk_128
- ;CHECK: vpsubsb %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_subs_epi8_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi8_rrkz_128
- ;CHECK: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epi8_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
- ;CHECK-LABEL: test_mask_subs_epi8_rm_128
- ;CHECK: vpsubsb (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_subs_epi8_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi8_rmk_128
- ;CHECK: vpsubsb (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_subs_epi8_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi8_rmkz_128
- ;CHECK: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epi8_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
@@ -3112,45 +3974,63 @@ define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b,
declare <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
- ;CHECK-LABEL: test_mask_subs_epi8_rr_256
- ;CHECK: vpsubsb %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_subs_epi8_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi8_rrk_256
- ;CHECK: vpsubsb %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_subs_epi8_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi8_rrkz_256
- ;CHECK: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epi8_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
- ;CHECK-LABEL: test_mask_subs_epi8_rm_256
- ;CHECK: vpsubsb (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_subs_epi8_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi8_rmk_256
- ;CHECK: vpsubsb (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_subs_epi8_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
- ;CHECK-LABEL: test_mask_subs_epi8_rmkz_256
- ;CHECK: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epi8_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
@@ -3159,45 +4039,63 @@ define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b,
declare <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
- ;CHECK-LABEL: test_mask_adds_epu8_rr_128
- ;CHECK: vpaddusb %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_adds_epu8_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu8_rrk_128
- ;CHECK: vpaddusb %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_adds_epu8_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu8_rrkz_128
- ;CHECK: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epu8_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
- ;CHECK-LABEL: test_mask_adds_epu8_rm_128
- ;CHECK: vpaddusb (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_adds_epu8_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdc,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu8_rmk_128
- ;CHECK: vpaddusb (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_adds_epu8_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu8_rmkz_128
- ;CHECK: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epu8_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
@@ -3206,45 +4104,63 @@ define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b,
declare <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
- ;CHECK-LABEL: test_mask_adds_epu8_rr_256
- ;CHECK: vpaddusb %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_adds_epu8_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu8_rrk_256
- ;CHECK: vpaddusb %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_adds_epu8_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu8_rrkz_256
- ;CHECK: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epu8_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
- ;CHECK-LABEL: test_mask_adds_epu8_rm_256
- ;CHECK: vpaddusb (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_adds_epu8_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdc,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu8_rmk_256
- ;CHECK: vpaddusb (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_adds_epu8_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
- ;CHECK-LABEL: test_mask_adds_epu8_rmkz_256
- ;CHECK: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_adds_epu8_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
@@ -3253,45 +4169,63 @@ define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b,
declare <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
- ;CHECK-LABEL: test_mask_subs_epu8_rr_128
- ;CHECK: vpsubusb %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mask_subs_epu8_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu8_rrk_128
- ;CHECK: vpsubusb %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mask_subs_epu8_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu8_rrkz_128
- ;CHECK: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epu8_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
- ;CHECK-LABEL: test_mask_subs_epu8_rm_128
- ;CHECK: vpsubusb (%rdi), %xmm0, %xmm0
+; CHECK-LABEL: test_mask_subs_epu8_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu8_rmk_128
- ;CHECK: vpsubusb (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_mask_subs_epu8_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
ret <16 x i8> %res
}
define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu8_rmkz_128
- ;CHECK: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epu8_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
%res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
ret <16 x i8> %res
@@ -3300,45 +4234,63 @@ define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b,
declare <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
- ;CHECK-LABEL: test_mask_subs_epu8_rr_256
- ;CHECK: vpsubusb %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mask_subs_epu8_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu8_rrk_256
- ;CHECK: vpsubusb %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mask_subs_epu8_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu8_rrkz_256
- ;CHECK: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epu8_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
- ;CHECK-LABEL: test_mask_subs_epu8_rm_256
- ;CHECK: vpsubusb (%rdi), %ymm0, %ymm0
+; CHECK-LABEL: test_mask_subs_epu8_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu8_rmk_256
- ;CHECK: vpsubusb (%rdi), %ymm0, %ymm1 {%k1}
+; CHECK-LABEL: test_mask_subs_epu8_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
ret <32 x i8> %res
}
define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
- ;CHECK-LABEL: test_mask_subs_epu8_rmkz_256
- ;CHECK: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mask_subs_epu8_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
ret <32 x i8> %res
@@ -3348,11 +4300,14 @@ declare <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8>, <32 x i8>, <32 x
declare <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_b_128
-; CHECK-NOT: call
-; CHECK: vpmaxsb %xmm
-; CHECK: {%k1}
define <16 x i8>@test_int_x86_avx512_mask_pmaxs_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3c,0xd1]
+; CHECK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x3c,0xc1]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2 ,i16 %mask)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask)
%res2 = add <16 x i8> %res, %res1
@@ -3361,11 +4316,14 @@ define <16 x i8>@test_int_x86_avx512_mask_pmaxs_b_128(<16 x i8> %x0, <16 x i8> %
declare <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_b_256
-; CHECK-NOT: call
-; CHECK: vpmaxsb %ymm
-; CHECK: {%k1}
define <32 x i8>@test_int_x86_avx512_mask_pmaxs_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3c,0xd1]
+; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x3c,0xc1]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
%res2 = add <32 x i8> %res, %res1
@@ -3374,11 +4332,14 @@ define <32 x i8>@test_int_x86_avx512_mask_pmaxs_b_256(<32 x i8> %x0, <32 x i8> %
declare <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_w_128
-; CHECK-NOT: call
-; CHECK: vpmaxsw %xmm
-; CHECK: {%k1}
define <8 x i16>@test_int_x86_avx512_mask_pmaxs_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xee,0xd1]
+; CHECK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xee,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3387,11 +4348,14 @@ define <8 x i16>@test_int_x86_avx512_mask_pmaxs_w_128(<8 x i16> %x0, <8 x i16> %
declare <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_w_256
-; CHECK-NOT: call
-; CHECK: vpmaxsw %ymm
-; CHECK: {%k1}
define <16 x i16>@test_int_x86_avx512_mask_pmaxs_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xee,0xd1]
+; CHECK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xee,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask)
%res2 = add <16 x i16> %res, %res1
@@ -3400,11 +4364,14 @@ define <16 x i16>@test_int_x86_avx512_mask_pmaxs_w_256(<16 x i16> %x0, <16 x i16
declare <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_b_128
-; CHECK-NOT: call
-; CHECK: vpmaxub %xmm
-; CHECK: {%k1}
define <16 x i8>@test_int_x86_avx512_mask_pmaxu_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2,i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxub %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xde,0xd1]
+; CHECK-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xde,0xc1]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask)
%res2 = add <16 x i8> %res, %res1
@@ -3413,11 +4380,14 @@ define <16 x i8>@test_int_x86_avx512_mask_pmaxu_b_128(<16 x i8> %x0, <16 x i8> %
declare <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_b_256
-; CHECK-NOT: call
-; CHECK: vpmaxub %ymm
-; CHECK: {%k1}
define <32 x i8>@test_int_x86_avx512_mask_pmaxu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xde,0xd1]
+; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xde,0xc1]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
%res2 = add <32 x i8> %res, %res1
@@ -3426,11 +4396,14 @@ define <32 x i8>@test_int_x86_avx512_mask_pmaxu_b_256(<32 x i8> %x0, <32 x i8> %
declare <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_w_128
-; CHECK-NOT: call
-; CHECK: vpmaxuw %xmm
-; CHECK: {%k1}
define <8 x i16>@test_int_x86_avx512_mask_pmaxu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3e,0xd1]
+; CHECK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x3e,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3439,11 +4412,14 @@ define <8 x i16>@test_int_x86_avx512_mask_pmaxu_w_128(<8 x i16> %x0, <8 x i16> %
declare <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_w_256
-; CHECK-NOT: call
-; CHECK: vpmaxuw %ymm
-; CHECK: {%k1}
define <16 x i16>@test_int_x86_avx512_mask_pmaxu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3e,0xd1]
+; CHECK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x3e,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask)
%res2 = add <16 x i16> %res, %res1
@@ -3452,11 +4428,14 @@ define <16 x i16>@test_int_x86_avx512_mask_pmaxu_w_256(<16 x i16> %x0, <16 x i16
declare <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_b_128
-; CHECK-NOT: call
-; CHECK: vpminsb %xmm
-; CHECK: {%k1}
define <16 x i8>@test_int_x86_avx512_mask_pmins_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmins_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x38,0xd1]
+; CHECK-NEXT: vpminsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x38,0xc1]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask)
%res2 = add <16 x i8> %res, %res1
@@ -3465,11 +4444,14 @@ define <16 x i8>@test_int_x86_avx512_mask_pmins_b_128(<16 x i8> %x0, <16 x i8> %
declare <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_b_256
-; CHECK-NOT: call
-; CHECK: vpminsb %ymm
-; CHECK: {%k1}
define <32 x i8>@test_int_x86_avx512_mask_pmins_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmins_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x38,0xd1]
+; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x38,0xc1]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
%res2 = add <32 x i8> %res, %res1
@@ -3478,11 +4460,14 @@ define <32 x i8>@test_int_x86_avx512_mask_pmins_b_256(<32 x i8> %x0, <32 x i8> %
declare <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_w_128
-; CHECK-NOT: call
-; CHECK: vpminsw %xmm
-; CHECK: {%k1}
define <8 x i16>@test_int_x86_avx512_mask_pmins_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmins_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xea,0xd1]
+; CHECK-NEXT: vpminsw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xea,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3491,11 +4476,14 @@ define <8 x i16>@test_int_x86_avx512_mask_pmins_w_128(<8 x i16> %x0, <8 x i16> %
declare <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_w_256
-; CHECK-NOT: call
-; CHECK: vpminsw %ymm
-; CHECK: {%k1}
define <16 x i16>@test_int_x86_avx512_mask_pmins_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmins_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xea,0xd1]
+; CHECK-NEXT: vpminsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xea,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask)
%res2 = add <16 x i16> %res, %res1
@@ -3504,11 +4492,14 @@ define <16 x i16>@test_int_x86_avx512_mask_pmins_w_256(<16 x i16> %x0, <16 x i16
declare <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_b_128
-; CHECK-NOT: call
-; CHECK: vpminub %xmm
-; CHECK: {%k1}
define <16 x i8>@test_int_x86_avx512_mask_pminu_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pminu_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminub %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xda,0xd1]
+; CHECK-NEXT: vpminub %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xda,0xc1]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask)
%res2 = add <16 x i8> %res, %res1
@@ -3517,11 +4508,14 @@ define <16 x i8>@test_int_x86_avx512_mask_pminu_b_128(<16 x i8> %x0, <16 x i8> %
declare <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_b_256
-; CHECK-NOT: call
-; CHECK: vpminub %ymm
-; CHECK: {%k1}
define <32 x i8>@test_int_x86_avx512_mask_pminu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pminu_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xda,0xd1]
+; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xda,0xc1]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
%res2 = add <32 x i8> %res, %res1
@@ -3530,11 +4524,14 @@ define <32 x i8>@test_int_x86_avx512_mask_pminu_b_256(<32 x i8> %x0, <32 x i8> %
declare <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_w_128
-; CHECK-NOT: call
-; CHECK: vpminuw %xmm
-; CHECK: {%k1}
define <8 x i16>@test_int_x86_avx512_mask_pminu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pminu_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminuw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3a,0xd1]
+; CHECK-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x3a,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3543,11 +4540,14 @@ define <8 x i16>@test_int_x86_avx512_mask_pminu_w_128(<8 x i16> %x0, <8 x i16> %
declare <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_w_256
-; CHECK-NOT: call
-; CHECK: vpminuw %ymm
-; CHECK: {%k1}
define <16 x i16>@test_int_x86_avx512_mask_pminu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pminu_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminuw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3a,0xd1]
+; CHECK-NEXT: vpminuw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x3a,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask)
%res2 = add <16 x i16> %res, %res1
@@ -3556,12 +4556,15 @@ define <16 x i16>@test_int_x86_avx512_mask_pminu_w_256(<16 x i16> %x0, <16 x i16
declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermt2w %xmm{{.*}}{%k1}
-; CHECK-NOT: {z}
define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd9]
+; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7d,0xda]
+; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xca]
+; CHECK-NEXT: vpaddw %xmm1, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3570,11 +4573,15 @@ define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x
declare <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermt2w %xmm{{.*}}{%k1} {z}
define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd9]
+; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x7d,0xda]
+; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xca]
+; CHECK-NEXT: vpaddw %xmm1, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3583,11 +4590,15 @@ define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x
declare <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermt2w %ymm{{.*}}{%k1}
define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd9]
+; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7d,0xda]
+; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xca]
+; CHECK-NEXT: vpaddw %ymm1, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
@@ -3596,11 +4607,15 @@ define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16
declare <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermt2w %ymm{{.*}}{%k1} {z}
define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd9]
+; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x7d,0xda]
+; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xca]
+; CHECK-NEXT: vpaddw %ymm1, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
@@ -3609,11 +4624,15 @@ define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <1
declare <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermi2w %xmm{{.*}}{%k1}
define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd9]
+; CHECK-NEXT: vpermi2w %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x75,0xda]
+; CHECK-NEXT: vpermi2w %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0xfd,0x08,0x75,0xca]
+; CHECK-NEXT: vpaddw %xmm1, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3622,11 +4641,15 @@ define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x
declare <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermi2w %ymm{{.*}}{%k1}
define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd9]
+; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x75,0xda]
+; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0xfd,0x28,0x75,0xca]
+; CHECK-NEXT: vpaddw %ymm1, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
@@ -3635,11 +4658,14 @@ define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16
declare <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_b_128
-; CHECK-NOT: call
-; CHECK: vpavgb %xmm
-; CHECK: {%k1}
define <16 x i8>@test_int_x86_avx512_mask_pavg_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pavg_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe0,0xd1]
+; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe0,0xc1]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
%res2 = add <16 x i8> %res, %res1
@@ -3648,11 +4674,14 @@ define <16 x i8>@test_int_x86_avx512_mask_pavg_b_128(<16 x i8> %x0, <16 x i8> %x
declare <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_b_256
-; CHECK-NOT: call
-; CHECK: vpavgb %ymm
-; CHECK: {%k1}
define <32 x i8>@test_int_x86_avx512_mask_pavg_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pavg_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe0,0xd1]
+; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe0,0xc1]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
%res2 = add <32 x i8> %res, %res1
@@ -3661,11 +4690,14 @@ define <32 x i8>@test_int_x86_avx512_mask_pavg_b_256(<32 x i8> %x0, <32 x i8> %x
declare <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_w_128
-; CHECK-NOT: call
-; CHECK: vpavgw %xmm
-; CHECK: {%k1}
define <8 x i16>@test_int_x86_avx512_mask_pavg_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pavg_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe3,0xd1]
+; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe3,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3674,11 +4706,14 @@ define <8 x i16>@test_int_x86_avx512_mask_pavg_w_128(<8 x i16> %x0, <8 x i16> %x
declare <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_w_256
-; CHECK-NOT: call
-; CHECK: vpavgw %ymm
-; CHECK: {%k1}
define <16 x i16>@test_int_x86_avx512_mask_pavg_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pavg_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe3,0xd1]
+; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe3,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
@@ -3687,11 +4722,14 @@ define <16 x i16>@test_int_x86_avx512_mask_pavg_w_256(<16 x i16> %x0, <16 x i16>
declare <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_b_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpshufb %xmm{{.*}}{%k1}
define <16 x i8>@test_int_x86_avx512_mask_pshuf_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x00,0xd1]
+; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x00,0xc1]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
%res2 = add <16 x i8> %res, %res1
@@ -3700,11 +4738,14 @@ define <16 x i8>@test_int_x86_avx512_mask_pshuf_b_128(<16 x i8> %x0, <16 x i8> %
declare <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_b_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpshufb %ymm{{.*}}{%k1}
define <32 x i8>@test_int_x86_avx512_mask_pshuf_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpshufb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x00,0xd1]
+; CHECK-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x00,0xc1]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
%res2 = add <32 x i8> %res, %res1
@@ -3713,11 +4754,14 @@ define <32 x i8>@test_int_x86_avx512_mask_pshuf_b_256(<32 x i8> %x0, <32 x i8> %
declare <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8>, <16 x i8>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_b_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsb{{.*}}{%k1}
define <16 x i8>@test_int_x86_avx512_mask_pabs_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpabsb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x1c,0xc8]
+; CHECK-NEXT: vpabsb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x1c,0xc0]
+; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
%res2 = add <16 x i8> %res, %res1
@@ -3726,11 +4770,14 @@ define <16 x i8>@test_int_x86_avx512_mask_pabs_b_128(<16 x i8> %x0, <16 x i8> %x
declare <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8>, <32 x i8>, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_b_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsb{{.*}}{%k1}
define <32 x i8>@test_int_x86_avx512_mask_pabs_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpabsb %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x1c,0xc8]
+; CHECK-NEXT: vpabsb %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x1c,0xc0]
+; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 -1)
%res2 = add <32 x i8> %res, %res1
@@ -3739,11 +4786,14 @@ define <32 x i8>@test_int_x86_avx512_mask_pabs_b_256(<32 x i8> %x0, <32 x i8> %x
declare <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_w_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsw{{.*}}{%k1}
define <8 x i16>@test_int_x86_avx512_mask_pabs_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpabsw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x1d,0xc8]
+; CHECK-NEXT: vpabsw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x1d,0xc0]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3752,57 +4802,30 @@ define <8 x i16>@test_int_x86_avx512_mask_pabs_w_128(<8 x i16> %x0, <8 x i16> %x
declare <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_w_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsw{{.*}}{%k1}
define <16 x i16>@test_int_x86_avx512_mask_pabs_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpabsw %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x1d,0xc8]
+; CHECK-NEXT: vpabsw %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x1d,0xc0]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 -1)
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}
-; CHECK-LABEL: test_x86_mask_blend_b_256
-; CHECK: vpblendmb
-define <32 x i8> @test_x86_mask_blend_b_256(i32 %a0, <32 x i8> %a1, <32 x i8> %a2) {
- %res = call <32 x i8> @llvm.x86.avx512.mask.blend.b.256(<32 x i8> %a1, <32 x i8> %a2, i32 %a0) ; <<32 x i8>> [#uses=1]
- ret <32 x i8> %res
-}
-declare <32 x i8> @llvm.x86.avx512.mask.blend.b.256(<32 x i8>, <32 x i8>, i32) nounwind readonly
-
-; CHECK-LABEL: test_x86_mask_blend_w_256
-define <16 x i16> @test_x86_mask_blend_w_256(i16 %mask, <16 x i16> %a1, <16 x i16> %a2) {
- ; CHECK: vpblendmw
- %res = call <16 x i16> @llvm.x86.avx512.mask.blend.w.256(<16 x i16> %a1, <16 x i16> %a2, i16 %mask) ; <<16 x i16>> [#uses=1]
- ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx512.mask.blend.w.256(<16 x i16>, <16 x i16>, i16) nounwind readonly
-
-; CHECK-LABEL: test_x86_mask_blend_b_128
-; CHECK: vpblendmb
-define <16 x i8> @test_x86_mask_blend_b_128(i16 %a0, <16 x i8> %a1, <16 x i8> %a2) {
- %res = call <16 x i8> @llvm.x86.avx512.mask.blend.b.128(<16 x i8> %a1, <16 x i8> %a2, i16 %a0) ; <<16 x i8>> [#uses=1]
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.avx512.mask.blend.b.128(<16 x i8>, <16 x i8>, i16) nounwind readonly
-
-; CHECK-LABEL: test_x86_mask_blend_w_128
-define <8 x i16> @test_x86_mask_blend_w_128(i8 %mask, <8 x i16> %a1, <8 x i16> %a2) {
- ; CHECK: vpblendmw
- %res = call <8 x i16> @llvm.x86.avx512.mask.blend.w.128(<8 x i16> %a1, <8 x i16> %a2, i8 %mask) ; <<8 x i16>> [#uses=1]
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.avx512.mask.blend.w.128(<8 x i16>, <8 x i16>, i8) nounwind readonly
-
declare <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: {%k1}
-; CHECK: vpmulhuw {{.*}}encoding: [0x62
define <8 x i16>@test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmulhu_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe4,0xd1]
+; CHECK-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe4,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3811,12 +4834,14 @@ define <8 x i16>@test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16>
declare <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: {%k1}
-; CHECK: vpmulhuw {{.*}}encoding: [0x62
define <16 x i16>@test_int_x86_avx512_mask_pmulhu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmulhu_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe4,0xd1]
+; CHECK-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe4,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
@@ -3825,12 +4850,14 @@ define <16 x i16>@test_int_x86_avx512_mask_pmulhu_w_256(<16 x i16> %x0, <16 x i1
declare <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: {%k1}
-; CHECK: vpmulhw {{.*}}encoding: [0x62
define <8 x i16>@test_int_x86_avx512_mask_pmulh_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmulh_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmulhw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe5,0xd1]
+; CHECK-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe5,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3838,12 +4865,15 @@ define <8 x i16>@test_int_x86_avx512_mask_pmulh_w_128(<8 x i16> %x0, <8 x i16> %
}
declare <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: {%k1}
-; CHECK: vpmulhw {{.*}}encoding: [0x62
+
define <16 x i16>@test_int_x86_avx512_mask_pmulh_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmulh_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmulhw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe5,0xd1]
+; CHECK-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe5,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
@@ -3851,12 +4881,15 @@ define <16 x i16>@test_int_x86_avx512_mask_pmulh_w_256(<16 x i16> %x0, <16 x i16
}
declare <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: {%k1}
-; CHECK: vpmulhrsw {{.*}}encoding: [0x62
+
define <8 x i16>@test_int_x86_avx512_mask_pmulhr_sw_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmulhr_sw_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x0b,0xd1]
+; CHECK-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x0b,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -3864,12 +4897,15 @@ define <8 x i16>@test_int_x86_avx512_mask_pmulhr_sw_128(<8 x i16> %x0, <8 x i16>
}
declare <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: {%k1}
-; CHECK: vpmulhrsw {{.*}}encoding: [0x62
+
define <16 x i16>@test_int_x86_avx512_mask_pmulhr_sw_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmulhr_sw_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x0b,0xd1]
+; CHECK-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x0b,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
@@ -3880,9 +4916,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_128:
-; CHECK: vpmovwb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovwb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovwb %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovwb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x30,0xc1]
+; CHECK-NEXT: vpmovwb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x30,0xc2]
+; CHECK-NEXT: vpmovwb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3895,8 +4936,11 @@ declare void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16>, i8)
define void @test_int_x86_avx512_mask_pmov_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_128:
-; CHECK: vpmovwb %xmm0, (%rdi)
-; CHECK: vpmovwb %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovwb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x30,0x07]
+; CHECK-NEXT: vpmovwb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x30,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
ret void
@@ -3906,9 +4950,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_128:
-; CHECK: vpmovswb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovswb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovswb %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovswb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x20,0xc1]
+; CHECK-NEXT: vpmovswb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x20,0xc2]
+; CHECK-NEXT: vpmovswb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3921,8 +4970,11 @@ declare void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16>, i8)
define void @test_int_x86_avx512_mask_pmovs_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_128:
-; CHECK: vpmovswb %xmm0, (%rdi)
-; CHECK: vpmovswb %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovswb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x20,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovswb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x20,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
ret void
@@ -3932,9 +4984,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_128:
-; CHECK: vpmovuswb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovuswb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovuswb %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovuswb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x10,0xc1]
+; CHECK-NEXT: vpmovuswb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x10,0xc2]
+; CHECK-NEXT: vpmovuswb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3947,8 +5004,11 @@ declare void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16>, i8)
define void @test_int_x86_avx512_mask_pmovus_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_128:
-; CHECK: vpmovuswb %xmm0, (%rdi)
-; CHECK: vpmovuswb %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovuswb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x10,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovuswb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
ret void
@@ -3958,9 +5018,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_256:
-; CHECK: vpmovwb %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovwb %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovwb %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovwb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1]
+; CHECK-NEXT: vpmovwb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc2]
+; CHECK-NEXT: vpmovwb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
@@ -3973,8 +5038,11 @@ declare void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16>, i16)
define void @test_int_x86_avx512_mask_pmov_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_256:
-; CHECK: vpmovwb %ymm0, (%rdi)
-; CHECK: vpmovwb %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovwb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x30,0x07]
+; CHECK-NEXT: vpmovwb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x30,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
ret void
@@ -3984,9 +5052,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_256:
-; CHECK: vpmovswb %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovswb %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovswb %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovswb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x20,0xc1]
+; CHECK-NEXT: vpmovswb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x20,0xc2]
+; CHECK-NEXT: vpmovswb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x20,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
@@ -3999,8 +5072,11 @@ declare void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16>, i16)
define void @test_int_x86_avx512_mask_pmovs_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_256:
-; CHECK: vpmovswb %ymm0, (%rdi)
-; CHECK: vpmovswb %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovswb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x20,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovswb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x20,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
ret void
@@ -4010,9 +5086,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16
define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_256:
-; CHECK: vpmovuswb %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovuswb %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovuswb %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovuswb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x10,0xc1]
+; CHECK-NEXT: vpmovuswb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x10,0xc2]
+; CHECK-NEXT: vpmovuswb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x10,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
@@ -4025,8 +5106,11 @@ declare void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16>, i16)
define void @test_int_x86_avx512_mask_pmovus_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_256:
-; CHECK: vpmovuswb %ymm0, (%rdi)
-; CHECK: vpmovuswb %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovuswb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x10,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovuswb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
ret void
@@ -4037,12 +5121,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x
define <4 x i32>@test_int_x86_avx512_mask_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf5,0xd1]
+; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xf5,0xc1]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4054,12 +5137,11 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8
define <8 x i32>@test_int_x86_avx512_mask_pmaddw_d_256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf5,0xd1]
+; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xf5,0xc1]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -4071,12 +5153,11 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8
define <8 x i16>@test_int_x86_avx512_mask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x04,0xd1]
+; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x04,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
@@ -4088,182 +5169,29 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <1
define <16 x i16>@test_int_x86_avx512_mask_pmaddubs_w_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x04,0xd1]
+; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x04,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}
-declare <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
-
-define <16 x i8>@test_int_x86_avx512_mask_punpckhb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_128:
-; CHECK: vpunpckhbw %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[8],k1[8],xmm2[9],k1[9],xmm2[10],k1[10],xmm2[11],k1[11],xmm2[12],k1[12],xmm2[13],k1[13],xmm2[14],k1[14],xmm2[15],k1[15]
-; CHECK-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x68,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
- %res = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
- %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
- %res2 = add <16 x i8> %res, %res1
- ret <16 x i8> %res2
-}
-
-declare <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
-
-define <16 x i8>@test_int_x86_avx512_mask_punpcklb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_128:
-; CHECK: vpunpcklbw %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[0],k1[0],xmm2[1],k1[1],xmm2[2],k1[2],xmm2[3],k1[3],xmm2[4],k1[4],xmm2[5],k1[5],xmm2[6],k1[6],xmm2[7],k1[7]
-; CHECK-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x60,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
- %res = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
- %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
- %res2 = add <16 x i8> %res, %res1
- ret <16 x i8> %res2
-}
-
-declare <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
-
-define <32 x i8>@test_int_x86_avx512_mask_punpckhb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_256:
-; CHECK: vpunpckhbw %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[8],k1[8],ymm2[9],k1[9],ymm2[10],k1[10],ymm2[11],k1[11],ymm2[12],k1[12],ymm2[13],k1[13],ymm2[14],k1[14],ymm2[15],k1[15],ymm2[24],k1[24],ymm2[25],k1[25],ymm2[26],k1[26],ymm2[27],k1[27],ymm2[28],k1[28],ymm2[29],k1[29],ymm2[30],k1[30],ymm2[31],k1[31]
-; CHECK-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x68,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
- %res = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
- %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
- %res2 = add <32 x i8> %res, %res1
- ret <32 x i8> %res2
-}
-
-declare <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
-
-define <32 x i8>@test_int_x86_avx512_mask_punpcklb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_256:
-; CHECK: vpunpcklbw %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[0],k1[0],ymm2[1],k1[1],ymm2[2],k1[2],ymm2[3],k1[3],ymm2[4],k1[4],ymm2[5],k1[5],ymm2[6],k1[6],ymm2[7],k1[7],ymm2[16],k1[16],ymm2[17],k1[17],ymm2[18],k1[18],ymm2[19],k1[19],ymm2[20],k1[20],ymm2[21],k1[21],ymm2[22],k1[22],ymm2[23],k1[23]
-; CHECK-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x60,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
- %res = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
- %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
- %res2 = add <32 x i8> %res, %res1
- ret <32 x i8> %res2
-}
-
-declare <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-
-define <8 x i16>@test_int_x86_avx512_mask_punpcklw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_128:
-; CHECK: vpunpcklwd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[0],k1[0],xmm2[1],k1[1],xmm2[2],k1[2],xmm2[3],k1[3]
-; CHECK-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x61,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
- %res = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
- %res1 = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
- %res2 = add <8 x i16> %res, %res1
- ret <8 x i16> %res2
-}
-
-declare <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-
-define <8 x i16>@test_int_x86_avx512_mask_punpckhw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_128:
-; CHECK: vpunpckhwd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[4],k1[4],xmm2[5],k1[5],xmm2[6],k1[6],xmm2[7],k1[7]
-; CHECK-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x69,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
- %res = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
- %res1 = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
- %res2 = add <8 x i16> %res, %res1
- ret <8 x i16> %res2
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_mask_punpcklw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_256:
-; CHECK: vpunpcklwd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[0],k1[0],ymm2[1],k1[1],ymm2[2],k1[2],ymm2[3],k1[3],ymm2[8],k1[8],ymm2[9],k1[9],ymm2[10],k1[10],ymm2[11],k1[11]
-; CHECK-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x61,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
- %res = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
- %res2 = add <16 x i16> %res, %res1
- ret <16 x i16> %res2
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_mask_punpckhw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_256:
-; CHECK: vpunpckhwd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[4],k1[4],ymm2[5],k1[5],ymm2[6],k1[6],ymm2[7],k1[7],ymm2[12],k1[12],ymm2[13],k1[13],ymm2[14],k1[14],ymm2[15],k1[15]
-; CHECK-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x69,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
- %res = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
- %res2 = add <16 x i16> %res, %res1
- ret <16 x i16> %res2
-}
-
-declare <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8>, <16 x i8>, i32, <16 x i8>, i16)
-
-define <16 x i8>@test_int_x86_avx512_mask_palignr_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x3, i16 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_palignr_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 %x4)
- %res1 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> zeroinitializer, i16 %x4)
- %res2 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 -1)
- %res3 = add <16 x i8> %res, %res1
- %res4 = add <16 x i8> %res3, %res2
- ret <16 x i8> %res4
-}
-
-declare <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8>, <32 x i8>, i32, <32 x i8>, i32)
-
-define <32 x i8>@test_int_x86_avx512_mask_palignr_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x3, i32 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_palignr_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 %x4)
- %res1 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> zeroinitializer, i32 %x4)
- %res2 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 -1)
- %res3 = add <32 x i8> %res, %res1
- %res4 = add <32 x i8> %res3, %res2
- ret <32 x i8> %res4
-}
-
declare <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8>, <16 x i8>, i32, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_dbpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x42,0xd1,0x02]
+; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x42,0xd9,0x02]
+; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x42,0xc1,0x02]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> %x3, i8 %x4)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> zeroinitializer, i8 %x4)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> %x3, i8 -1)
@@ -4277,13 +5205,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8>, <32 x i8>, i32,
define <16 x i16>@test_int_x86_avx512_mask_dbpsadbw_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x42,0xd1,0x02]
+; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xd9,0x02]
+; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x42,0xc1,0x02]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> %x3, i16 %x4)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> zeroinitializer, i16 %x4)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> %x3, i16 -1)
@@ -4292,135 +5220,15 @@ define <16 x i16>@test_int_x86_avx512_mask_dbpsadbw_256(<32 x i8> %x0, <32 x i8>
ret <16 x i16> %res4
}
-declare <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8>, <32 x i8>, i32)
-
-define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0
-; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
- %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1)
- %res1 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask)
- %res2 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> zeroinitializer, i32 %mask)
- %res3 = add <32 x i8> %res, %res1
- %res4 = add <32 x i8> %res2, %res3
- ret <32 x i8> %res4
-}
-
-declare <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8>, <16 x i8>, i16)
-
-define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0
-; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
- %res1 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask)
- %res2 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> zeroinitializer, i16 %mask)
- %res3 = add <16 x i8> %res, %res1
- %res4 = add <16 x i8> %res2, %res3
- ret <16 x i8> %res4
-}
-
-declare <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16>, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
- %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1)
- %res1 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask)
- %res2 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> zeroinitializer, i16 %mask)
- %res3 = add <16 x i16> %res, %res1
- %res4 = add <16 x i16> %res2, %res3
- ret <16 x i16> %res4
-}
-
-declare <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16>, <8 x i16>, i8)
-
-define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
- %res1 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask)
- %res2 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> zeroinitializer, i8 %mask)
- %res3 = add <8 x i16> %res, %res1
- %res4 = add <8 x i16> %res2, %res3
- ret <8 x i16> %res4
-}
-
-declare <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8>, <64 x i8>, i64)
-
-define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8]
-; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xd0]
-; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xc0]
-; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
-; CHECK-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1)
- %res1 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask)
- %res2 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> zeroinitializer, i64 %mask)
- %res3 = add <64 x i8> %res, %res1
- %res4 = add <64 x i8> %res2, %res3
- ret <64 x i8> %res4
-}
-
-declare <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16>, <32 x i16>, i32)
-
-define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xd0]
-; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xc0]
-; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
-; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1)
- %res1 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask)
- %res2 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> zeroinitializer, i32 %mask)
- %res3 = add <32 x i16> %res, %res1
- %res4 = add <32 x i16> %res2, %res3
- ret <32 x i16> %res4
-}
-
declare i16 @llvm.x86.avx512.cvtb2mask.128(<16 x i8>)
define i16@test_int_x86_avx512_cvtb2mask_128(<16 x i8> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtb2mask_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovb2m %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpmovb2m %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x29,0xc0]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.cvtb2mask.128(<16 x i8> %x0)
ret i16 %res
}
@@ -4430,9 +5238,9 @@ declare i32 @llvm.x86.avx512.cvtb2mask.256(<32 x i8>)
define i32@test_int_x86_avx512_cvtb2mask_256(<32 x i8> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtb2mask_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovb2m %ymm0, %k0
-; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpmovb2m %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x29,0xc0]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx512.cvtb2mask.256(<32 x i8> %x0)
ret i32 %res
}
@@ -4442,9 +5250,10 @@ declare i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16>)
define i8@test_int_x86_avx512_cvtw2mask_128(<8 x i16> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtw2mask_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovw2m %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpmovw2m %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x29,0xc0]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16> %x0)
ret i8 %res
}
@@ -4454,9 +5263,10 @@ declare i16 @llvm.x86.avx512.cvtw2mask.256(<16 x i16>)
define i16@test_int_x86_avx512_cvtw2mask_256(<16 x i16> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtw2mask_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovw2m %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpmovw2m %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x29,0xc0]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.cvtw2mask.256(<16 x i16> %x0)
ret i16 %res
}
@@ -4466,9 +5276,9 @@ declare <16 x i8> @llvm.x86.avx512.cvtmask2b.128(i16)
define <16 x i8>@test_int_x86_avx512_cvtmask2b_128(i16 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2b_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpmovm2b %k0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7]
+; CHECK-NEXT: vpmovm2b %k0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x28,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.cvtmask2b.128(i16 %x0)
ret <16 x i8> %res
}
@@ -4478,9 +5288,9 @@ declare <32 x i8> @llvm.x86.avx512.cvtmask2b.256(i32)
define <32 x i8>@test_int_x86_avx512_cvtmask2b_256(i32 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2b_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k0
-; CHECK-NEXT: vpmovm2b %k0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovd %edi, %k0 ## encoding: [0xc5,0xfb,0x92,0xc7]
+; CHECK-NEXT: vpmovm2b %k0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x28,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.cvtmask2b.256(i32 %x0)
ret <32 x i8> %res
}
@@ -4490,10 +5300,9 @@ declare <8 x i16> @llvm.x86.avx512.cvtmask2w.128(i8)
define <8 x i16>@test_int_x86_avx512_cvtmask2w_128(i8 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2w_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k0
-; CHECK-NEXT: vpmovm2w %k0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7]
+; CHECK-NEXT: vpmovm2w %k0, %xmm0 ## encoding: [0x62,0xf2,0xfe,0x08,0x28,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.cvtmask2w.128(i8 %x0)
ret <8 x i16> %res
}
@@ -4503,9 +5312,9 @@ declare <16 x i16> @llvm.x86.avx512.cvtmask2w.256(i16)
define <16 x i16>@test_int_x86_avx512_cvtmask2w_256(i16 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2w_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpmovm2w %k0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7]
+; CHECK-NEXT: vpmovm2w %k0, %ymm0 ## encoding: [0x62,0xf2,0xfe,0x28,0x28,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.cvtmask2w.256(i16 %x0)
ret <16 x i16> %res
}
@@ -4515,14 +5324,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16>, <8 x i16>, <8 x i1
define <8 x i16>@test_int_x86_avx512_mask_psrl_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_w_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: vpaddw %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1]
+; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xd9]
+; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd1,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
@@ -4536,13 +5344,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16>, <8 x i16>, <16 x
define <16 x i16>@test_int_x86_avx512_mask_psrl_w_256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_w_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1]
+; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xd9]
+; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd1,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
@@ -4551,42 +5359,41 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_w_256(<16 x i16> %x0, <8 x i16>
ret <16 x i16> %res4
}
-declare <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16>, i8, <8 x i16>, i8)
+declare <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16>, i32, <8 x i16>, i8)
-define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
+define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_wi_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 %x3)
- %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 -1)
- %res2 = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i8 3, <8 x i16> zeroinitializer, i8 %x3)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03]
+; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x71,0xd0,0x03]
+; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x71,0xd0,0x03]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
%res3 = add <8 x i16> %res, %res1
%res4 = add <8 x i16> %res2, %res3
ret <8 x i16> %res4
}
-declare <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16>, i8, <16 x i16>, i16)
+declare <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16>, i32, <16 x i16>, i16)
-define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i8 %x1, <16 x i16> %x2, i16 %x3) {
+define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_wi_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: retq
- %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 %x3)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 -1)
- %res2 = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i8 3, <16 x i16> zeroinitializer, i16 %x3)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03]
+; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x71,0xd0,0x03]
+; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x71,0xd0,0x03]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
%res3 = add <16 x i16> %res, %res1
%res4 = add <16 x i16> %res3, %res2
ret <16 x i16> %res4
@@ -4597,13 +5404,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16>, <16 x i16>, <16
define <16 x i16>@test_int_x86_avx512_mask_psrlv16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv16_hi:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x10,0xd1]
+; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x10,0xd9]
+; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x10,0xc1]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -4617,14 +5424,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16>, <8 x i16>, <8 x i16
define <8 x i16>@test_int_x86_avx512_mask_psrlv8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv8_hi:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x10,0xd1]
+; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x10,0xd9]
+; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x10,0xc1]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -4638,14 +5444,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16>, <8 x i16>, <8 x i1
define <8 x i16>@test_int_x86_avx512_mask_psra_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_w_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe1,0xd1]
+; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe1,0xd9]
+; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe1,0xc1]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -4654,22 +5459,21 @@ define <8 x i16>@test_int_x86_avx512_mask_psra_w_128(<8 x i16> %x0, <8 x i16> %x
ret <8 x i16> %res4
}
-declare <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16>, i8, <8 x i16>, i8)
+declare <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16>, i32, <8 x i16>, i8)
-define <8 x i16>@test_int_x86_avx512_mask_psra_wi_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
+define <8 x i16>@test_int_x86_avx512_mask_psra_wi_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_wi_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsraw $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpsraw $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpsraw $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 %x3)
- %res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i8 3, <8 x i16> zeroinitializer, i8 %x3)
- %res2 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsraw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xe0,0x03]
+; CHECK-NEXT: vpsraw $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x71,0xe0,0x03]
+; CHECK-NEXT: vpsraw $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x71,0xe0,0x03]
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xca]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1)
%res3 = add <8 x i16> %res, %res1
%res4 = add <8 x i16> %res3, %res2
ret <8 x i16> %res4
@@ -4680,13 +5484,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16>, <8 x i16>, <16 x
define <16 x i16>@test_int_x86_avx512_mask_psra_w_256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_w_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe1,0xd1]
+; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe1,0xd9]
+; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe1,0xc1]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -4695,151 +5499,21 @@ define <16 x i16>@test_int_x86_avx512_mask_psra_w_256(<16 x i16> %x0, <8 x i16>
ret <16 x i16> %res4
}
-declare <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16>, i8, <16 x i16>, i16)
+declare <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16>, i32, <16 x i16>, i16)
-define <16 x i16>@test_int_x86_avx512_mask_psra_wi_256(<16 x i16> %x0, i8 %x1, <16 x i16> %x2, i16 %x3) {
+define <16 x i16>@test_int_x86_avx512_mask_psra_wi_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_wi_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpsraw $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpsraw $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpsraw $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 %x3)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i8 3, <16 x i16> zeroinitializer, i16 %x3)
- %res2 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 -1)
- %res3 = add <16 x i16> %res, %res1
- %res4 = add <16 x i16> %res3, %res2
- ret <16 x i16> %res4
-}
-
-declare <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32>, i16, <4 x i32>, i8)
-
-define <4 x i32>@test_int_x86_avx512_mask_pshuf_d_128(<4 x i32> %x0, i16 %x1, <4 x i32> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpshufd $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpshufd $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpshufd $3, %xmm0, %xmm0
-; CHECK-NEXT: ## xmm0 = xmm0[3,0,0,0]
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i16 3, <4 x i32> %x2, i8 %x3)
- %res1 = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i16 3, <4 x i32> zeroinitializer, i8 %x3)
- %res2 = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i16 3, <4 x i32> %x2, i8 -1)
- %res3 = add <4 x i32> %res, %res1
- %res4 = add <4 x i32> %res3, %res2
- ret <4 x i32> %res4
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32>, i16, <8 x i32>, i8)
-
-define <8 x i32>@test_int_x86_avx512_mask_pshuf_d_256(<8 x i32> %x0, i16 %x1, <8 x i32> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpshufd $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpshufd $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpshufd $3, %ymm0, %ymm0
-; CHECK-NEXT: ## ymm0 = ymm0[3,0,0,0,7,4,4,4]
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i16 3, <8 x i32> %x2, i8 %x3)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i16 3, <8 x i32> zeroinitializer, i8 %x3)
- %res2 = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i16 3, <8 x i32> %x2, i8 -1)
- %res3 = add <8 x i32> %res, %res1
- %res4 = add <8 x i32> %res3, %res2
- ret <8 x i32> %res4
-}
-
-declare <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16>, i8, <8 x i16>, i8)
-
-define <8 x i16>@test_int_x86_avx512_mask_pshufh_w_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pshufh_w_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm0
-; CHECK-NEXT: ## xmm0 = xmm0[0,1,2,3,7,4,4,4]
-; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 %x3)
- %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i8 3, <8 x i16> zeroinitializer, i8 %x3)
- %res2 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 -1)
- %res3 = add <8 x i16> %res, %res1
- %res4 = add <8 x i16> %res3, %res2
- ret <8 x i16> %res4
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16>, i8, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_mask_pshufh_w_256(<16 x i16> %x0, i8 %x1, <16 x i16> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pshufh_w_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm0
-; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12]
-; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 %x3)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i8 3, <16 x i16> zeroinitializer, i16 %x3)
- %res2 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 -1)
- %res3 = add <16 x i16> %res, %res1
- %res4 = add <16 x i16> %res3, %res2
- ret <16 x i16> %res4
-}
-
-declare <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16>, i8, <8 x i16>, i8)
-
-define <8 x i16>@test_int_x86_avx512_mask_pshufl_w_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pshufl_w_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm0
-; CHECK-NEXT: ## xmm0 = xmm0[3,0,0,0,4,5,6,7]
-; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 %x3)
- %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i8 3, <8 x i16> zeroinitializer, i8 %x3)
- %res2 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 -1)
- %res3 = add <8 x i16> %res, %res1
- %res4 = add <8 x i16> %res3, %res2
- ret <8 x i16> %res4
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16>, i8, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_mask_pshufl_w_256(<16 x i16> %x0, i8 %x1, <16 x i16> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pshufl_w_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm0
-; CHECK-NEXT: ## ymm0 = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15]
-; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 %x3)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i8 3, <16 x i16> zeroinitializer, i16 %x3)
- %res2 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsraw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xe0,0x03]
+; CHECK-NEXT: vpsraw $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x71,0xe0,0x03]
+; CHECK-NEXT: vpsraw $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x71,0xe0,0x03]
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xca]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1)
%res3 = add <16 x i16> %res, %res1
%res4 = add <16 x i16> %res3, %res2
ret <16 x i16> %res4
@@ -4850,13 +5524,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16>, <16 x i16>, <16
define <16 x i16>@test_int_x86_avx512_mask_psrav16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav16_hi:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x11,0xd1]
+; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x11,0xd9]
+; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x11,0xc1]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -4870,14 +5544,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16>, <8 x i16>, <8 x i16
define <8 x i16>@test_int_x86_avx512_mask_psrav8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_hi:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x11,0xd1]
+; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x11,0xd9]
+; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x11,0xc1]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -4892,14 +5565,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16>, <8 x i16>, <8 x i1
define <8 x i16>@test_int_x86_avx512_mask_psll_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_w_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf1,0xd1]
+; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf1,0xd9]
+; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xf1,0xc1]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -4913,13 +5585,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16>, <8 x i16>, <16 x
define <16 x i16>@test_int_x86_avx512_mask_psll_w_256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_w_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf1,0xd1]
+; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf1,0xd9]
+; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xf1,0xc1]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -4928,42 +5600,41 @@ define <16 x i16>@test_int_x86_avx512_mask_psll_w_256(<16 x i16> %x0, <8 x i16>
ret <16 x i16> %res4
}
-declare <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16>, i8, <8 x i16>, i8)
+declare <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16>, i32, <8 x i16>, i8)
-define <8 x i16>@test_int_x86_avx512_mask_psll_wi_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
+define <8 x i16>@test_int_x86_avx512_mask_psll_wi_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_wi_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllw $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpsllw $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpsllw $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 %x3)
- %res1 = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i8 3, <8 x i16> zeroinitializer, i8 %x3)
- %res2 = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i8 3, <8 x i16> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsllw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xf0,0x03]
+; CHECK-NEXT: vpsllw $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x71,0xf0,0x03]
+; CHECK-NEXT: vpsllw $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x71,0xf0,0x03]
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xca]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1)
%res3 = add <8 x i16> %res, %res1
%res4 = add <8 x i16> %res3, %res2
ret <8 x i16> %res4
}
-declare <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16>, i8, <16 x i16>, i16)
+declare <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16>, i32, <16 x i16>, i16)
-define <16 x i16>@test_int_x86_avx512_mask_psll_wi_256(<16 x i16> %x0, i8 %x1, <16 x i16> %x2, i16 %x3) {
+define <16 x i16>@test_int_x86_avx512_mask_psll_wi_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_wi_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpsllw $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpsllw $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpsllw $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 %x3)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i8 3, <16 x i16> zeroinitializer, i16 %x3)
- %res2 = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i8 3, <16 x i16> %x2, i16 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsllw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xf0,0x03]
+; CHECK-NEXT: vpsllw $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x71,0xf0,0x03]
+; CHECK-NEXT: vpsllw $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x71,0xf0,0x03]
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xca]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1)
%res3 = add <16 x i16> %res, %res1
%res4 = add <16 x i16> %res3, %res2
ret <16 x i16> %res4
@@ -4974,13 +5645,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16>, <16 x i16>, <16
define <16 x i16>@test_int_x86_avx512_mask_psllv16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv16_hi:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x12,0xd1]
+; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x12,0xd9]
+; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x12,0xc1]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -4994,14 +5665,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16>, <8 x i16>, <8 x i16
define <8 x i16>@test_int_x86_avx512_mask_psllv8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv8_hi:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x12,0xd1]
+; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x12,0xd9]
+; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x12,0xc1]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -5015,14 +5685,16 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovzxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_w_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxbw %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovzxbw %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxbw %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxbw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x30,0xc8]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpmovzxbw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x30,0xd0]
+; CHECK-NEXT: ## xmm2 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpmovzxbw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x30,0xc0]
+; CHECK-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xca]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 -1)
@@ -5036,13 +5708,16 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8>, <16 x i16>, i1
define <16 x i16>@test_int_x86_avx512_mask_pmovzxb_w_256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_w_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpmovzxbw %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovzxbw %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxbw %xmm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxbw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x30,0xc8]
+; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NEXT: vpmovzxbw %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x30,0xd0]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NEXT: vpmovzxbw %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x30,0xc0]
+; CHECK-NEXT: ## ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xca]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 -1)
@@ -5057,14 +5732,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovsxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_w_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxbw %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsxbw %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0
-; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxbw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x20,0xc8]
+; CHECK-NEXT: vpmovsxbw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x20,0xd0]
+; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x20,0xc0]
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xca]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 -1)
@@ -5078,13 +5752,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8>, <16 x i16>, i1
define <16 x i16>@test_int_x86_avx512_mask_pmovsxb_w_256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_w_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpmovsxbw %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovsxbw %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
-; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x20,0xc8]
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x20,0xd0]
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x20,0xc0]
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xca]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2)
%res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 -1)
@@ -5098,14 +5772,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_pmovsxd_q_128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsxdq %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x25,0xc8]
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x25,0xd0]
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x25,0xc0]
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 -1)
@@ -5119,14 +5792,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_pmovsxd_q_256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxdq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovsxdq %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxdq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x25,0xc8]
+; CHECK-NEXT: vpmovsxdq %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x25,0xd0]
+; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x25,0xc0]
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 -1)
@@ -5135,3 +5807,272 @@ define <4 x i64>@test_int_x86_avx512_mask_pmovsxd_q_256(<4 x i32> %x0, <4 x i64>
ret <4 x i64> %res4
}
+declare <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_hi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0x8d,0xd0]
+; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xd8]
+; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xc0]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_permvar_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_hi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0x8d,0xd0]
+; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xd8]
+; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xc0]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xcb]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8>, <16 x i8>, i16)
+
+define i16@test_int_x86_avx512_ptestm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestmb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x08,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
+ %res1 = call i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
+ %res2 = add i16 %res, %res1
+ ret i16 %res2
+}
+
+declare i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8>, <32 x i8>, i32)
+
+define i32@test_int_x86_avx512_ptestm_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vptestmb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x26,0xc1]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: vptestmb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x28,0x26,0xc1]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
+ %res1 = call i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 -1)
+ %res2 = add i32 %res, %res1
+ ret i32 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16>, <8 x i16>, i8)
+
+define i8@test_int_x86_avx512_ptestm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestmw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestmw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16>, <16 x i16>, i16)
+
+define i16@test_int_x86_avx512_ptestm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestmw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestmw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2)
+ %res1 = call i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 -1)
+ %res2 = add i16 %res, %res1
+ ret i16 %res2
+}
+
+declare i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8>, <16 x i8>, i16)
+
+define i16@test_int_x86_avx512_ptestnm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestnmb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestnmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
+ %res1 = call i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
+ %res2 = add i16 %res, %res1
+ ret i16 %res2
+}
+
+declare i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8>, <32 x i8>, i32)
+
+define i32@test_int_x86_avx512_ptestnm_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vptestnmb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x26,0xc1]
+; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT: vptestnmb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x26,0xc1]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
+ %res1 = call i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 -1)
+ %res2 = add i32 %res, %res1
+ ret i32 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16>, <8 x i16>, i8)
+
+define i8@test_int_x86_avx512_ptestnm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestnmw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x09,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestnmw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16>, <16 x i16>, i16)
+
+define i16@test_int_x86_avx512_ptestnm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestnmw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x29,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestnmw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x26,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2)
+ %res1 = call i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 -1)
+ %res2 = add i16 %res, %res1
+ ret i16 %res2
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpbroadcastb %dil, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc7]
+; CHECK-NEXT: vpbroadcastb %dil, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xcf]
+; CHECK-NEXT: vpbroadcastb %dil, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xd7]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xc0]
+; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 %mask)
+ %res2 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> zeroinitializer, i32 %mask)
+ %res3 = add <32 x i8> %res, %res1
+ %res4 = add <32 x i8> %res2, %res3
+ ret <32 x i8> %res4
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastb %dil, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7]
+; CHECK-NEXT: vpbroadcastb %dil, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xcf]
+; CHECK-NEXT: vpbroadcastb %dil, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xd7]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xc0]
+; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 %mask)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> zeroinitializer, i16 %mask)
+ %res3 = add <16 x i8> %res, %res1
+ %res4 = add <16 x i8> %res2, %res3
+ ret <16 x i8> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastw %di, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7]
+; CHECK-NEXT: vpbroadcastw %di, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xcf]
+; CHECK-NEXT: vpbroadcastw %di, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xd7]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 %mask)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> zeroinitializer, i16 %mask)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res2, %res3
+ ret <16 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastw %di, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7]
+; CHECK-NEXT: vpbroadcastw %di, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xcf]
+; CHECK-NEXT: vpbroadcastw %di, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xd7]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 %mask)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> zeroinitializer, i8 %mask)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res2, %res3
+ ret <8 x i16> %res4
+}
diff --git a/test/CodeGen/X86/avx512bwvl-mov.ll b/test/CodeGen/X86/avx512bwvl-mov.ll
index 8a9a4fa5e5e2..6bd9c9384050 100644
--- a/test/CodeGen/X86/avx512bwvl-mov.ll
+++ b/test/CodeGen/X86/avx512bwvl-mov.ll
@@ -1,27 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
-; CHECK-LABEL: test_256_1
-; CHECK: vmovdqu8 {{.*}} ## encoding: [0x62
-; CHECK: ret
define <32 x i8> @test_256_1(i8 * %addr) {
+; CHECK-LABEL: test_256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7f,0x28,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <32 x i8>*
%res = load <32 x i8>, <32 x i8>* %vaddr, align 1
ret <32 x i8>%res
}
-; CHECK-LABEL: test_256_2
-; CHECK: vmovdqu8{{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_256_2(i8 * %addr, <32 x i8> %data) {
+; CHECK-LABEL: test_256_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu8 %ymm0, (%rdi) ## encoding: [0x62,0xf1,0x7f,0x28,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <32 x i8>*
store <32 x i8>%data, <32 x i8>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_256_3
-; CHECK: vmovdqu8{{.*{%k[1-7]} }}## encoding: [0x62
-; CHECK: ret
define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) {
+; CHECK-LABEL: test_256_3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqb %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x3f,0xca,0x04]
+; CHECK-NEXT: vpblendmb (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x66,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <32 x i8> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <32 x i8>*
%r = load <32 x i8>, <32 x i8>* %vaddr, align 1
@@ -29,10 +35,13 @@ define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) {
ret <32 x i8>%res
}
-; CHECK-LABEL: test_256_4
-; CHECK: vmovdqu8{{.*{%k[1-7]} {z} }}## encoding: [0x62
-; CHECK: ret
define <32 x i8> @test_256_4(i8 * %addr, <32 x i8> %mask1) {
+; CHECK-LABEL: test_256_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <32 x i8> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <32 x i8>*
%r = load <32 x i8>, <32 x i8>* %vaddr, align 1
@@ -40,28 +49,33 @@ define <32 x i8> @test_256_4(i8 * %addr, <32 x i8> %mask1) {
ret <32 x i8>%res
}
-; CHECK-LABEL: test_256_5
-; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
-; CHECK: ret
define <16 x i16> @test_256_5(i8 * %addr) {
+; CHECK-LABEL: test_256_5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xff,0x28,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i16>*
%res = load <16 x i16>, <16 x i16>* %vaddr, align 1
ret <16 x i16>%res
}
-; CHECK-LABEL: test_256_6
-; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_256_6(i8 * %addr, <16 x i16> %data) {
+; CHECK-LABEL: test_256_6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu16 %ymm0, (%rdi) ## encoding: [0x62,0xf1,0xff,0x28,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i16>*
store <16 x i16>%data, <16 x i16>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_256_7
-; CHECK: vmovdqu16{{.*{%k[1-7]} }}## encoding: [0x62
-; CHECK: ret
define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) {
+; CHECK-LABEL: test_256_7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqw %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x3f,0xca,0x04]
+; CHECK-NEXT: vpblendmw (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x66,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i16> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i16>*
%r = load <16 x i16>, <16 x i16>* %vaddr, align 1
@@ -69,10 +83,13 @@ define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) {
ret <16 x i16>%res
}
-; CHECK-LABEL: test_256_8
-; CHECK: vmovdqu16{{.*{%k[1-7]} {z} }}## encoding: [0x62
-; CHECK: ret
define <16 x i16> @test_256_8(i8 * %addr, <16 x i16> %mask1) {
+; CHECK-LABEL: test_256_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i16> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i16>*
%r = load <16 x i16>, <16 x i16>* %vaddr, align 1
@@ -80,28 +97,33 @@ define <16 x i16> @test_256_8(i8 * %addr, <16 x i16> %mask1) {
ret <16 x i16>%res
}
-; CHECK-LABEL: test_128_1
-; CHECK: vmovdqu8 {{.*}} ## encoding: [0x62
-; CHECK: ret
define <16 x i8> @test_128_1(i8 * %addr) {
+; CHECK-LABEL: test_128_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7f,0x08,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i8>*
%res = load <16 x i8>, <16 x i8>* %vaddr, align 1
ret <16 x i8>%res
}
-; CHECK-LABEL: test_128_2
-; CHECK: vmovdqu8{{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_128_2(i8 * %addr, <16 x i8> %data) {
+; CHECK-LABEL: test_128_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu8 %xmm0, (%rdi) ## encoding: [0x62,0xf1,0x7f,0x08,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i8>*
store <16 x i8>%data, <16 x i8>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_128_3
-; CHECK: vmovdqu8{{.*{%k[1-7]} }}## encoding: [0x62
-; CHECK: ret
define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) {
+; CHECK-LABEL: test_128_3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqb %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x3f,0xca,0x04]
+; CHECK-NEXT: vpblendmb (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x66,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i8> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i8>*
%r = load <16 x i8>, <16 x i8>* %vaddr, align 1
@@ -109,10 +131,13 @@ define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) {
ret <16 x i8>%res
}
-; CHECK-LABEL: test_128_4
-; CHECK: vmovdqu8{{.*{%k[1-7]} {z} }}## encoding: [0x62
-; CHECK: ret
define <16 x i8> @test_128_4(i8 * %addr, <16 x i8> %mask1) {
+; CHECK-LABEL: test_128_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i8> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i8>*
%r = load <16 x i8>, <16 x i8>* %vaddr, align 1
@@ -120,28 +145,33 @@ define <16 x i8> @test_128_4(i8 * %addr, <16 x i8> %mask1) {
ret <16 x i8>%res
}
-; CHECK-LABEL: test_128_5
-; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
-; CHECK: ret
define <8 x i16> @test_128_5(i8 * %addr) {
+; CHECK-LABEL: test_128_5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i16>*
%res = load <8 x i16>, <8 x i16>* %vaddr, align 1
ret <8 x i16>%res
}
-; CHECK-LABEL: test_128_6
-; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_128_6(i8 * %addr, <8 x i16> %data) {
+; CHECK-LABEL: test_128_6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu16 %xmm0, (%rdi) ## encoding: [0x62,0xf1,0xff,0x08,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i16>*
store <8 x i16>%data, <8 x i16>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_128_7
-; CHECK: vmovdqu16{{.*{%k[1-7]} }}## encoding: [0x62
-; CHECK: ret
define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) {
+; CHECK-LABEL: test_128_7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqw %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x3f,0xca,0x04]
+; CHECK-NEXT: vpblendmw (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x66,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i16> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i16>*
%r = load <8 x i16>, <8 x i16>* %vaddr, align 1
@@ -149,10 +179,13 @@ define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) {
ret <8 x i16>%res
}
-; CHECK-LABEL: test_128_8
-; CHECK: vmovdqu16{{.*{%k[1-7]} {z} }}## encoding: [0x62
-; CHECK: ret
define <8 x i16> @test_128_8(i8 * %addr, <8 x i16> %mask1) {
+; CHECK-LABEL: test_128_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i16> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i16>*
%r = load <8 x i16>, <8 x i16>* %vaddr, align 1
diff --git a/test/CodeGen/X86/avx512bwvl-vec-cmp.ll b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll
index 9bf02fa41d9a..17e581bbb501 100644
--- a/test/CodeGen/X86/avx512bwvl-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll
@@ -1,94 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
-; CHECK-LABEL: test256_1
-; CHECK: vpcmpeqb {{.*%k[0-7]}}
-; CHECK: vmovdqu8 {{.*}}%k1
-; CHECK: ret
define <32 x i8> @test256_1(<32 x i8> %x, <32 x i8> %y) nounwind {
+; CHECK-LABEL: test256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp eq <32 x i8> %x, %y
%max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %y
ret <32 x i8> %max
}
-; CHECK-LABEL: test256_2
-; CHECK: vpcmpgtb {{.*%k[0-7]}}
-; CHECK: vmovdqu8 {{.*}}%k1
-; CHECK: ret
define <32 x i8> @test256_2(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind {
+; CHECK-LABEL: test256_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sgt <32 x i8> %x, %y
%max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
ret <32 x i8> %max
}
-; CHECK-LABEL: @test256_3
-; CHECK: vpcmplew {{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <16 x i16> @test256_3(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1) nounwind {
+; CHECK-LABEL: test256_3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k1
+; CHECK-NEXT: vpblendmw %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sge <16 x i16> %x, %y
%max = select <16 x i1> %mask, <16 x i16> %x1, <16 x i16> %y
ret <16 x i16> %max
}
-; CHECK-LABEL: test256_4
-; CHECK: vpcmpnleub {{.*%k[0-7]}}
-; CHECK: vmovdqu8 {{.*}}%k1
-; CHECK: ret
define <32 x i8> @test256_4(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind {
+; CHECK-LABEL: test256_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp ugt <32 x i8> %x, %y
%max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
ret <32 x i8> %max
}
-; CHECK-LABEL: test256_5
-; CHECK: vpcmpeqw (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <16 x i16> @test256_5(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %yp) nounwind {
+; CHECK-LABEL: test256_5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k1
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%y = load <16 x i16>, <16 x i16>* %yp, align 4
%mask = icmp eq <16 x i16> %x, %y
%max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
ret <16 x i16> %max
}
-; CHECK-LABEL: @test256_6
-; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <16 x i16> @test256_6(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
+; CHECK-LABEL: test256_6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k1
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
%mask = icmp sgt <16 x i16> %x, %y
%max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
ret <16 x i16> %max
}
-; CHECK-LABEL: @test256_7
-; CHECK: vpcmplew (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <16 x i16> @test256_7(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
+; CHECK-LABEL: test256_7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmplew (%rdi), %ymm0, %k1
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
%mask = icmp sle <16 x i16> %x, %y
%max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
ret <16 x i16> %max
}
-; CHECK-LABEL: @test256_8
-; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <16 x i16> @test256_8(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
+; CHECK-LABEL: test256_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleuw (%rdi), %ymm0, %k1
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
%mask = icmp ule <16 x i16> %x, %y
%max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
ret <16 x i16> %max
}
-; CHECK-LABEL: @test256_9
-; CHECK: vpcmpeqw %ymm{{.*{%k[1-7]}}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <16 x i16> @test256_9(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1, <16 x i16> %y1) nounwind {
+; CHECK-LABEL: test256_9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 {%k1}
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp eq <16 x i16> %x1, %y1
%mask0 = icmp eq <16 x i16> %x, %y
%mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
@@ -96,11 +107,13 @@ define <16 x i16> @test256_9(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1, <16 x
ret <16 x i16> %max
}
-; CHECK-LABEL: @test256_10
-; CHECK: vpcmpleb %ymm{{.*{%k[1-7]}}}
-; CHECK: vmovdqu8
-; CHECK: ret
define <32 x i8> @test256_10(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1, <32 x i8> %y1) nounwind {
+; CHECK-LABEL: test256_10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpcmpleb %ymm2, %ymm3, %k1 {%k1}
+; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <32 x i8> %x1, %y1
%mask0 = icmp sle <32 x i8> %x, %y
%mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer
@@ -108,11 +121,13 @@ define <32 x i8> @test256_10(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1, <32 x i8
ret <32 x i8> %max
}
-; CHECK-LABEL: @test256_11
-; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqu8
-; CHECK: ret
define <32 x i8> @test256_11(<32 x i8> %x, <32 x i8>* %y.ptr, <32 x i8> %x1, <32 x i8> %y1) nounwind {
+; CHECK-LABEL: test256_11:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpcmpgtb (%rdi), %ymm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sgt <32 x i8> %x1, %y1
%y = load <32 x i8>, <32 x i8>* %y.ptr, align 4
%mask0 = icmp sgt <32 x i8> %x, %y
@@ -121,11 +136,13 @@ define <32 x i8> @test256_11(<32 x i8> %x, <32 x i8>* %y.ptr, <32 x i8> %x1, <32
ret <32 x i8> %max
}
-; CHECK-LABEL: @test256_12
-; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <16 x i16> @test256_12(<16 x i16> %x, <16 x i16>* %y.ptr, <16 x i16> %x1, <16 x i16> %y1) nounwind {
+; CHECK-LABEL: test256_12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmplew %ymm1, %ymm2, %k1
+; CHECK-NEXT: vpcmpleuw (%rdi), %ymm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <16 x i16> %x1, %y1
%y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
%mask0 = icmp ule <16 x i16> %x, %y
@@ -134,95 +151,105 @@ define <16 x i16> @test256_12(<16 x i16> %x, <16 x i16>* %y.ptr, <16 x i16> %x1,
ret <16 x i16> %max
}
-; CHECK-LABEL: test128_1
-; CHECK: vpcmpeqb {{.*%k[0-7]}}
-; CHECK: vmovdqu8 {{.*}}%k1
-; CHECK: ret
define <16 x i8> @test128_1(<16 x i8> %x, <16 x i8> %y) nounwind {
+; CHECK-LABEL: test128_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp eq <16 x i8> %x, %y
%max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %y
ret <16 x i8> %max
}
-; CHECK-LABEL: test128_2
-; CHECK: vpcmpgtb {{.*%k[0-7]}}
-; CHECK: vmovdqu8 {{.*}}%k1
-; CHECK: ret
define <16 x i8> @test128_2(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind {
+; CHECK-LABEL: test128_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sgt <16 x i8> %x, %y
%max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
ret <16 x i8> %max
}
-; CHECK-LABEL: @test128_3
-; CHECK: vpcmplew {{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <8 x i16> @test128_3(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1) nounwind {
+; CHECK-LABEL: test128_3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k1
+; CHECK-NEXT: vpblendmw %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sge <8 x i16> %x, %y
%max = select <8 x i1> %mask, <8 x i16> %x1, <8 x i16> %y
ret <8 x i16> %max
}
-; CHECK-LABEL: test128_4
-; CHECK: vpcmpnleub {{.*%k[0-7]}}
-; CHECK: vmovdqu8 {{.*}}%k1
-; CHECK: ret
define <16 x i8> @test128_4(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind {
+; CHECK-LABEL: test128_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnleub %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp ugt <16 x i8> %x, %y
%max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
ret <16 x i8> %max
}
-; CHECK-LABEL: test128_5
-; CHECK: vpcmpeqw (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <8 x i16> @test128_5(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %yp) nounwind {
+; CHECK-LABEL: test128_5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k1
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <8 x i16>, <8 x i16>* %yp, align 4
%mask = icmp eq <8 x i16> %x, %y
%max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
ret <8 x i16> %max
}
-; CHECK-LABEL: @test128_6
-; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <8 x i16> @test128_6(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
+; CHECK-LABEL: test128_6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k1
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
%mask = icmp sgt <8 x i16> %x, %y
%max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
ret <8 x i16> %max
}
-; CHECK-LABEL: @test128_7
-; CHECK: vpcmplew (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <8 x i16> @test128_7(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
+; CHECK-LABEL: test128_7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmplew (%rdi), %xmm0, %k1
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
%mask = icmp sle <8 x i16> %x, %y
%max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
ret <8 x i16> %max
}
-; CHECK-LABEL: @test128_8
-; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <8 x i16> @test128_8(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
+; CHECK-LABEL: test128_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleuw (%rdi), %xmm0, %k1
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
%mask = icmp ule <8 x i16> %x, %y
%max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
ret <8 x i16> %max
}
-; CHECK-LABEL: @test128_9
-; CHECK: vpcmpeqw %xmm{{.*{%k[1-7]}}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <8 x i16> @test128_9(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1, <8 x i16> %y1) nounwind {
+; CHECK-LABEL: test128_9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 {%k1}
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp eq <8 x i16> %x1, %y1
%mask0 = icmp eq <8 x i16> %x, %y
%mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
@@ -230,11 +257,13 @@ define <8 x i16> @test128_9(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1, <8 x i16>
ret <8 x i16> %max
}
-; CHECK-LABEL: @test128_10
-; CHECK: vpcmpleb %xmm{{.*{%k[1-7]}}}
-; CHECK: vmovdqu8
-; CHECK: ret
define <16 x i8> @test128_10(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1, <16 x i8> %y1) nounwind {
+; CHECK-LABEL: test128_10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpcmpleb %xmm2, %xmm3, %k1 {%k1}
+; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <16 x i8> %x1, %y1
%mask0 = icmp sle <16 x i8> %x, %y
%mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
@@ -242,11 +271,13 @@ define <16 x i8> @test128_10(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1, <16 x i8
ret <16 x i8> %max
}
-; CHECK-LABEL: @test128_11
-; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqu8
-; CHECK: ret
define <16 x i8> @test128_11(<16 x i8> %x, <16 x i8>* %y.ptr, <16 x i8> %x1, <16 x i8> %y1) nounwind {
+; CHECK-LABEL: test128_11:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sgt <16 x i8> %x1, %y1
%y = load <16 x i8>, <16 x i8>* %y.ptr, align 4
%mask0 = icmp sgt <16 x i8> %x, %y
@@ -255,11 +286,13 @@ define <16 x i8> @test128_11(<16 x i8> %x, <16 x i8>* %y.ptr, <16 x i8> %x1, <16
ret <16 x i8> %max
}
-; CHECK-LABEL: @test128_12
-; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqu16
-; CHECK: ret
define <8 x i16> @test128_12(<8 x i16> %x, <8 x i16>* %y.ptr, <8 x i16> %x1, <8 x i16> %y1) nounwind {
+; CHECK-LABEL: test128_12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmplew %xmm1, %xmm2, %k1
+; CHECK-NEXT: vpcmpleuw (%rdi), %xmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i16> %x1, %y1
%y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
%mask0 = icmp ule <8 x i16> %x, %y
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
index 14e91e1a8768..b27b795b4409 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl| FileCheck %s
declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readonly
@@ -7,8 +8,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vplzcntd %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vplzcntd %xmm0, %xmm0
@@ -28,8 +28,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_vplzcnt_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntd %ymm0, %ymm1 {%k1}
; CHECK-NEXT: vplzcntd %ymm0, %ymm0
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -45,8 +44,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_vplzcnt_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntq %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vplzcntq %xmm0, %xmm0
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
@@ -62,8 +60,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntq %ymm0, %ymm1 {%k1}
; CHECK-NEXT: vplzcntq %ymm0, %ymm0
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
@@ -79,8 +76,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.conflict.d.128(<4 x i32>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_vpconflict_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpconflictd %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vpconflictd %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpconflictd %xmm0, %xmm0
@@ -100,8 +96,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.conflict.d.256(<8 x i32>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_vpconflict_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpconflictd %ymm0, %ymm1 {%k1}
; CHECK-NEXT: vpconflictd %ymm0, %ymm0
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -117,8 +112,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.conflict.q.128(<2 x i64>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_vpconflict_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpconflictq %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vpconflictq %xmm0, %xmm0
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
@@ -134,8 +128,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.conflict.q.256(<4 x i64>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_vpconflict_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpconflictq %ymm0, %ymm1 {%k1}
; CHECK-NEXT: vpconflictq %ymm0, %ymm0
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
@@ -147,33 +140,45 @@ define <4 x i64>@test_int_x86_avx512_mask_vpconflict_q_256(<4 x i64> %x0, <4 x i
}
define <8 x i32> @test_x86_vbroadcastmw_256(i16 %a0) {
- ; CHECK: test_x86_vbroadcastmw_256
- ; CHECK: vpbroadcastmw2d %k0, %ymm0
- %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ;
+; CHECK-LABEL: test_x86_vbroadcastmw_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: vpbroadcastmw2d %k0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ;
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16)
define <4 x i32> @test_x86_vbroadcastmw_128(i16 %a0) {
- ; CHECK: test_x86_vbroadcastmw_128
- ; CHECK: vpbroadcastmw2d %k0, %xmm0
- %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ;
+; CHECK-LABEL: test_x86_vbroadcastmw_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: vpbroadcastmw2d %k0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16)
define <4 x i64> @test_x86_broadcastmb_256(i8 %a0) {
- ; CHECK: test_x86_broadcastmb_256
- ; CHECK: vpbroadcastmb2q %k0, %ymm0
- %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ;
+; CHECK-LABEL: test_x86_broadcastmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: vpbroadcastmb2q %k0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ;
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8)
define <2 x i64> @test_x86_broadcastmb_128(i8 %a0) {
- ; CHECK: test_x86_broadcastmb_128
- ; CHECK: vpbroadcastmb2q %k0, %xmm0
- %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ;
+; CHECK-LABEL: test_x86_broadcastmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: vpbroadcastmb2q %k0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8)
diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll
index a59fe393f556..35db4901135f 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics.ll
@@ -1,4 +1,4 @@
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s
declare <8 x i64> @llvm.x86.avx512.mask.cvtpd2qq.512(<8 x double>, <8 x i64>, i8, i32)
@@ -194,13 +194,15 @@ define <8 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_512(<8 x i64> %x0, <8 x f
}
declare <8 x double> @llvm.x86.avx512.mask.reduce.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_pd_512
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vreducepd {{.*}}{%k1}
-; CHECK: vreducepd
-; CHECK: {sae}
+
define <8 x double>@test_int_x86_avx512_mask_reduce_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: vreducepd $8, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vreducepd $4, {sae}, %zmm0, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.reduce.pd.512(<8 x double> %x0, i32 8, <8 x double> %x2, i8 %x3, i32 4)
%res1 = call <8 x double> @llvm.x86.avx512.mask.reduce.pd.512(<8 x double> %x0, i32 4, <8 x double> %x2, i8 -1, i32 8)
%res2 = fadd <8 x double> %res, %res1
@@ -208,14 +210,15 @@ define <8 x double>@test_int_x86_avx512_mask_reduce_pd_512(<8 x double> %x0, <8
}
declare <16 x float> @llvm.x86.avx512.mask.reduce.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_ps_512
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vreduceps
-; CHECK: {sae}
-; CKECK: {%k1}
-; CHECK: vreduceps
+
define <16 x float>@test_int_x86_avx512_mask_reduce_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vreduceps $44, {sae}, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vreduceps $11, %zmm0, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.reduce.ps.512(<16 x float> %x0, i32 44, <16 x float> %x2, i16 %x3, i32 8)
%res1 = call <16 x float> @llvm.x86.avx512.mask.reduce.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 -1, i32 4)
%res2 = fadd <16 x float> %res, %res1
@@ -223,14 +226,15 @@ define <16 x float>@test_int_x86_avx512_mask_reduce_ps_512(<16 x float> %x0, <16
}
declare <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double>, <8 x double>, i32, <8 x double>, i8, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_range_pd_512
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrangepd
-; CKECK: {%k1}
-; CHECK: vrangepd
-; CHECK: {sae}
+
define <8 x double>@test_int_x86_avx512_mask_range_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_range_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: vrangepd $8, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vrangepd $4, {sae}, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double> %x0, <8 x double> %x1, i32 8, <8 x double> %x3, i8 %x4, i32 4)
%res1 = call <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double> %x0, <8 x double> %x1, i32 4, <8 x double> %x3, i8 -1, i32 8)
%res2 = fadd <8 x double> %res, %res1
@@ -239,14 +243,14 @@ define <8 x double>@test_int_x86_avx512_mask_range_pd_512(<8 x double> %x0, <8 x
declare <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float>, <16 x float>, i32, <16 x float>, i16, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_range_ps_512
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrangeps
-; CKECK: {%k1}
-; CHECK: vrangeps
-; CHECK: {sae}
define <16 x float>@test_int_x86_avx512_mask_range_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_range_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vrangeps $88, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vrangeps $4, {sae}, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float> %x0, <16 x float> %x1, i32 88, <16 x float> %x3, i16 %x4, i32 4)
%res1 = call <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float> %x0, <16 x float> %x1, i32 4, <16 x float> %x3, i16 -1, i32 8)
%res2 = fadd <16 x float> %res, %res1
@@ -255,14 +259,15 @@ define <16 x float>@test_int_x86_avx512_mask_range_ps_512(<16 x float> %x0, <16
declare <4 x float> @llvm.x86.avx512.mask.reduce.ss(<4 x float>, <4 x float>,<4 x float>, i8, i32, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_ss
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vreducess
-; CKECK: {%k1}
-; CHECK: vreducess
-; CHECK: {sae}
define <4 x float>@test_int_x86_avx512_mask_reduce_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vreducess $4, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vreducess $4, {sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.reduce.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4, i32 4, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.reduce.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 -1, i32 4, i32 8)
%res2 = fadd <4 x float> %res, %res1
@@ -270,15 +275,16 @@ define <4 x float>@test_int_x86_avx512_mask_reduce_ss(<4 x float> %x0, <4 x floa
}
declare <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float>, <4 x float>,<4 x float>, i8, i32, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_range_ss
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrangess
-; CHECK: {sae}
-; CKECK: {%k1}
-; CHECK: vrangess
-; CHECK: {sae}
+
define <4 x float>@test_int_x86_avx512_mask_range_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_range_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4, i32 4, i32 8)
%res1 = call <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 -1, i32 4, i32 8)
%res2 = fadd <4 x float> %res, %res1
@@ -287,14 +293,15 @@ define <4 x float>@test_int_x86_avx512_mask_range_ss(<4 x float> %x0, <4 x float
declare <2 x double> @llvm.x86.avx512.mask.reduce.sd(<2 x double>, <2 x double>,<2 x double>, i8, i32, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_sd
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vreducesd
-; CKECK: {%k1}
-; CHECK: vreducesd
-; CHECK: {sae}
define <2 x double>@test_int_x86_avx512_mask_reduce_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vreducesd $4, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vreducesd $4, {sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.reduce.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4, i32 4, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.reduce.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 -1, i32 4, i32 8)
%res2 = fadd <2 x double> %res, %res1
@@ -302,14 +309,16 @@ define <2 x double>@test_int_x86_avx512_mask_reduce_sd(<2 x double> %x0, <2 x do
}
declare <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double>, <2 x double>,<2 x double>, i8, i32, i32)
-; CHECK-LABEL: @test_int_x86_avx512_mask_range_sd
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrangesd
-; CKECK: {%k1}
-; CHECK: vrangesd
-; CHECK: {sae}
+
define <2 x double>@test_int_x86_avx512_mask_range_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_range_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vrangesd $4, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vrangesd $4, {sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4, i32 4, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 -1, i32 4, i32 8)
%res2 = fadd <2 x double> %res, %res1
@@ -439,14 +448,17 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i6
declare i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double>, i32, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_pd_512
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vfpclasspd
-; CHECK: {%k1}
-; CHECK: vfpclasspd
-; CHECK: kmovb %k0
define i8 @test_int_x86_avx512_mask_fpclass_pd_512(<8 x double> %x0, i8 %x1) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_pd_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: vfpclasspd $2, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %ecx
+; CHECK-NEXT: vfpclasspd $4, %zmm0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: addb %cl, %al
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double> %x0, i32 2, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double> %x0, i32 4, i8 -1)
%res2 = add i8 %res, %res1
@@ -454,14 +466,17 @@ define i8 @test_int_x86_avx512_mask_fpclass_pd_512(<8 x double> %x0, i8 %x1) {
}
declare i16 @llvm.x86.avx512.mask.fpclass.ps.512(<16 x float>, i32, i16)
-; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_ps_512
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vfpclassps
-; CHECK: vfpclassps
-; CHECK: {%k1}
-; CHECK: kmov
define i16@test_int_x86_avx512_mask_fpclass_ps_512(<16 x float> %x0, i16 %x1) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ps_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfpclassps $4, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %ecx
+; CHECK-NEXT: vfpclassps $4, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.fpclass.ps.512(<16 x float> %x0, i32 4, i16 %x1)
%res1 = call i16 @llvm.x86.avx512.mask.fpclass.ps.512(<16 x float> %x0, i32 4, i16 -1)
%res2 = add i16 %res, %res1
@@ -470,14 +485,28 @@ define i16@test_int_x86_avx512_mask_fpclass_ps_512(<16 x float> %x0, i16 %x1) {
declare i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double>, i32, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_sd
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vfpclasssd
-; CHECK: %k0 {%k1}
-; CHECK: vfpclasssd
-; CHECK: %k0
define i8 @test_int_x86_avx512_mask_fpclass_sd(<2 x double> %x0, i8 %x1) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_sd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfpclasssd $2, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: je LBB28_2
+; CHECK-NEXT: ## BB#1:
+; CHECK-NEXT: movb $-1, %al
+; CHECK-NEXT: LBB28_2:
+; CHECK-NEXT: vfpclasssd $4, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %ecx
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: je LBB28_4
+; CHECK-NEXT: ## BB#3:
+; CHECK-NEXT: movb $-1, %cl
+; CHECK-NEXT: LBB28_4:
+; CHECK-NEXT: addb %cl, %al
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %AX<kill>
+; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %x0, i32 2, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %x0, i32 4, i8 -1)
%res2 = add i8 %res, %res1
@@ -486,16 +515,28 @@ define i8 @test_int_x86_avx512_mask_fpclass_sd(<2 x double> %x0, i8 %x1) {
declare i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float>, i32, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_ss
-; CHECK-NOT: call
-; CHECK: kmovw
-; CHECK: vfpclassss
-; CHECK: %k0
-; CHECK: {%k1}
-; CHECK: kmovw
-; CHECK: vfpclassss
-; CHECK: %k0
define i8 @test_int_x86_avx512_mask_fpclass_ss(<4 x float> %x0, i8 %x1) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfpclassss $4, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: je LBB29_2
+; CHECK-NEXT: ## BB#1:
+; CHECK-NEXT: movb $-1, %al
+; CHECK-NEXT: LBB29_2:
+; CHECK-NEXT: vfpclassss $4, %xmm0, %k0
+; CHECK-NEXT: kmovw %k0, %ecx
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: je LBB29_4
+; CHECK-NEXT: ## BB#3:
+; CHECK-NEXT: movb $-1, %cl
+; CHECK-NEXT: LBB29_4:
+; CHECK-NEXT: addb %cl, %al
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %AX<kill>
+; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %x0, i32 4, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %x0, i32 4, i8 -1)
%res2 = add i8 %res, %res1
@@ -549,6 +590,7 @@ define i16@test_int_x86_avx512_cvtd2mask_512(<16 x i32> %x0) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpmovd2m %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.cvtd2mask.512(<16 x i32> %x0)
ret i16 %res
@@ -561,6 +603,7 @@ define i8@test_int_x86_avx512_cvtq2mask_512(<8 x i64> %x0) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpmovq2m %zmm0, %k0
; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.cvtq2mask.512(<8 x i64> %x0)
ret i8 %res
@@ -594,12 +637,15 @@ declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float>, <16 x
define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512(<8 x float> %x0, <16 x float> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x8_512:
-; CHECK: kmovw %edi, %k1
-; CHECK: vshuff32x4 $68, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshuff32x4 $68, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshuff32x4 $68, %zmm0, %zmm0, %zmm0
-; CHECK: vaddps %zmm1, %zmm0, %zmm0
-; CHECK: vaddps %zmm0, %zmm2, %zmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 -1)
%res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 %mask)
@@ -613,12 +659,15 @@ declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double>, <8 x
define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512(<2 x double> %x0, <8 x double> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_512:
-; CHECK: kmovb %edi, %k1
-; CHECK: vshuff64x2 $0, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshuff64x2 $0, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshuff64x2 $0, %zmm0, %zmm0, %zmm0
-; CHECK: vaddpd %zmm1, %zmm0, %zmm0
-; CHECK: vaddpd %zmm0, %zmm2, %zmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 -1)
%res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 %mask)
@@ -632,12 +681,15 @@ declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32>, <16 x i32
define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x8_512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x8_512:
-; CHECK: kmovw %edi, %k1
-; CHECK: vshufi32x4 $68, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshufi32x4 $68, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshufi32x4 $68, %zmm0, %zmm0, %zmm0
-; CHECK: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 -1)
%res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask)
@@ -651,12 +703,15 @@ declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64>, <8 x i64>,
define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_512:
-; CHECK: kmovb %edi, %k1
-; CHECK: vshufi64x2 $0, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshufi64x2 $0, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshufi64x2 $0, %zmm0, %zmm0, %zmm0
-; CHECK: vpaddq %zmm1, %zmm0, %zmm0
-; CHECK: vpaddq %zmm0, %zmm2, %zmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 -1)
%res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask)
diff --git a/test/CodeGen/X86/avx512dq-mask-op.ll b/test/CodeGen/X86/avx512dq-mask-op.ll
index b4d11bc0b77b..27c0b06d5f23 100644
--- a/test/CodeGen/X86/avx512dq-mask-op.ll
+++ b/test/CodeGen/X86/avx512dq-mask-op.ll
@@ -1,38 +1,69 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
define i8 @mask8(i8 %x) {
+; CHECK-LABEL: mask8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k0
+; CHECK-NEXT: knotb %k0, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
%m0 = bitcast i8 %x to <8 x i1>
%m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <8 x i1> %m1 to i8
ret i8 %ret
-; CHECK: mask8
-; CHECK: knotb
-; CHECK: ret
}
define void @mask8_mem(i8* %ptr) {
+; CHECK-LABEL: mask8_mem:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb (%rdi), %k0
+; CHECK-NEXT: knotb %k0, %k0
+; CHECK-NEXT: kmovb %k0, (%rdi)
+; CHECK-NEXT: retq
%x = load i8, i8* %ptr, align 4
%m0 = bitcast i8 %x to <8 x i1>
%m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
%ret = bitcast <8 x i1> %m1 to i8
store i8 %ret, i8* %ptr, align 4
ret void
-; CHECK-LABEL: mask8_mem
-; CHECK: kmovb ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
-; CHECK-NEXT: knotb
-; CHECK-NEXT: kmovb %k{{[0-7]}}, ([[ARG1]])
-; CHECK: ret
}
define i8 @mand8(i8 %x, i8 %y) {
+; CHECK-LABEL: mand8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: xorl %esi, %eax
+; CHECK-NEXT: andl %esi, %edi
+; CHECK-NEXT: orl %eax, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
%ma = bitcast i8 %x to <8 x i1>
%mb = bitcast i8 %y to <8 x i1>
%mc = and <8 x i1> %ma, %mb
%md = xor <8 x i1> %ma, %mb
%me = or <8 x i1> %mc, %md
%ret = bitcast <8 x i1> %me to i8
-; CHECK: kandb
-; CHECK: kxorb
-; CHECK: korb
+ ret i8 %ret
+}
+
+define i8 @mand8_mem(<8 x i1>* %x, <8 x i1>* %y) {
+; CHECK-LABEL: mand8_mem:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb (%rdi), %k0
+; CHECK-NEXT: kmovb (%rsi), %k1
+; CHECK-NEXT: kandb %k1, %k0, %k2
+; CHECK-NEXT: kxorb %k1, %k0, %k0
+; CHECK-NEXT: korb %k0, %k2, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
+ %ma = load <8 x i1>, <8 x i1>* %x
+ %mb = load <8 x i1>, <8 x i1>* %y
+ %mc = and <8 x i1> %ma, %mb
+ %md = xor <8 x i1> %ma, %mb
+ %me = or <8 x i1> %mc, %md
+ %ret = bitcast <8 x i1> %me to i8
ret i8 %ret
}
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
index 2065322009da..f201082fb1a7 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
@@ -1,53 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq -mattr=+avx512vl --show-mc-encoding| FileCheck %s
define <8 x i64> @test_mask_mullo_epi64_rr_512(<8 x i64> %a, <8 x i64> %b) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rr_512
- ;CHECK: vpmullq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x40,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi64_rr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x40,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
ret <8 x i64> %res
}
define <8 x i64> @test_mask_mullo_epi64_rrk_512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rrk_512
- ;CHECK: vpmullq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x40,0xd1]
+; CHECK-LABEL: test_mask_mullo_epi64_rrk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vpmullq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x40,0xd1]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
ret <8 x i64> %res
}
define <8 x i64> @test_mask_mullo_epi64_rrkz_512(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rrkz_512
- ;CHECK: vpmullq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x40,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi64_rrkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vpmullq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x40,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
ret <8 x i64> %res
}
define <8 x i64> @test_mask_mullo_epi64_rm_512(<8 x i64> %a, <8 x i64>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rm_512
- ;CHECK: vpmullq (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rm_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullq (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i64>, <8 x i64>* %ptr_b
%res = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
ret <8 x i64> %res
}
define <8 x i64> @test_mask_mullo_epi64_rmk_512(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmk_512
- ;CHECK: vpmullq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x40,0x0f]
+; CHECK-LABEL: test_mask_mullo_epi64_rmk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x40,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i64>, <8 x i64>* %ptr_b
%res = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
ret <8 x i64> %res
}
define <8 x i64> @test_mask_mullo_epi64_rmkz_512(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmkz_512
- ;CHECK: vpmullq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rmkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i64>, <8 x i64>* %ptr_b
%res = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
ret <8 x i64> %res
}
define <8 x i64> @test_mask_mullo_epi64_rmb_512(<8 x i64> %a, i64* %ptr_b) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmb_512
- ;CHECK: vpmullq (%rdi){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x58,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rmb_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullq (%rdi){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x58,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
%b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -56,8 +77,12 @@ define <8 x i64> @test_mask_mullo_epi64_rmb_512(<8 x i64> %a, i64* %ptr_b) {
}
define <8 x i64> @test_mask_mullo_epi64_rmbk_512(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmbk_512
- ;CHECK: vpmullq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x59,0x40,0x0f]
+; CHECK-LABEL: test_mask_mullo_epi64_rmbk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x59,0x40,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
%b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -66,8 +91,11 @@ define <8 x i64> @test_mask_mullo_epi64_rmbk_512(<8 x i64> %a, i64* %ptr_b, <8 x
}
define <8 x i64> @test_mask_mullo_epi64_rmbkz_512(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmbkz_512
- ;CHECK: vpmullq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xd9,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rmbkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xd9,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
%b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -77,53 +105,73 @@ define <8 x i64> @test_mask_mullo_epi64_rmbkz_512(<8 x i64> %a, i64* %ptr_b, i8
declare <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
define <4 x i64> @test_mask_mullo_epi64_rr_256(<4 x i64> %a, <4 x i64> %b) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rr_256
- ;CHECK: vpmullq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x40,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi64_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x40,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
ret <4 x i64> %res
}
define <4 x i64> @test_mask_mullo_epi64_rrk_256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rrk_256
- ;CHECK: vpmullq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x40,0xd1]
+; CHECK-LABEL: test_mask_mullo_epi64_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vpmullq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x40,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
ret <4 x i64> %res
}
define <4 x i64> @test_mask_mullo_epi64_rrkz_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rrkz_256
- ;CHECK: vpmullq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x40,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi64_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vpmullq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x40,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
ret <4 x i64> %res
}
define <4 x i64> @test_mask_mullo_epi64_rm_256(<4 x i64> %a, <4 x i64>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rm_256
- ;CHECK: vpmullq (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullq (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i64>, <4 x i64>* %ptr_b
%res = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
ret <4 x i64> %res
}
define <4 x i64> @test_mask_mullo_epi64_rmk_256(<4 x i64> %a, <4 x i64>* %ptr_b, <4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmk_256
- ;CHECK: vpmullq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x40,0x0f]
+; CHECK-LABEL: test_mask_mullo_epi64_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x40,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i64>, <4 x i64>* %ptr_b
%res = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
ret <4 x i64> %res
}
define <4 x i64> @test_mask_mullo_epi64_rmkz_256(<4 x i64> %a, <4 x i64>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmkz_256
- ;CHECK: vpmullq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i64>, <4 x i64>* %ptr_b
%res = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
ret <4 x i64> %res
}
define <4 x i64> @test_mask_mullo_epi64_rmb_256(<4 x i64> %a, i64* %ptr_b) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmb_256
- ;CHECK: vpmullq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x38,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x38,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
%b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -132,8 +180,12 @@ define <4 x i64> @test_mask_mullo_epi64_rmb_256(<4 x i64> %a, i64* %ptr_b) {
}
define <4 x i64> @test_mask_mullo_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmbk_256
- ;CHECK: vpmullq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x39,0x40,0x0f]
+; CHECK-LABEL: test_mask_mullo_epi64_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x39,0x40,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
%b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -142,8 +194,11 @@ define <4 x i64> @test_mask_mullo_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 x
}
define <4 x i64> @test_mask_mullo_epi64_rmbkz_256(<4 x i64> %a, i64* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmbkz_256
- ;CHECK: vpmullq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xb9,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xb9,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
%b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -154,53 +209,73 @@ define <4 x i64> @test_mask_mullo_epi64_rmbkz_256(<4 x i64> %a, i64* %ptr_b, i8
declare <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
define <2 x i64> @test_mask_mullo_epi64_rr_128(<2 x i64> %a, <2 x i64> %b) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rr_128
- ;CHECK: vpmullq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x40,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi64_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x40,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
ret <2 x i64> %res
}
define <2 x i64> @test_mask_mullo_epi64_rrk_128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rrk_128
- ;CHECK: vpmullq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x40,0xd1]
+; CHECK-LABEL: test_mask_mullo_epi64_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vpmullq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x40,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
ret <2 x i64> %res
}
define <2 x i64> @test_mask_mullo_epi64_rrkz_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rrkz_128
- ;CHECK: vpmullq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x40,0xc1]
+; CHECK-LABEL: test_mask_mullo_epi64_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vpmullq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x40,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
ret <2 x i64> %res
}
define <2 x i64> @test_mask_mullo_epi64_rm_128(<2 x i64> %a, <2 x i64>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rm_128
- ;CHECK: vpmullq (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullq (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <2 x i64>, <2 x i64>* %ptr_b
%res = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
ret <2 x i64> %res
}
define <2 x i64> @test_mask_mullo_epi64_rmk_128(<2 x i64> %a, <2 x i64>* %ptr_b, <2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmk_128
- ;CHECK: vpmullq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x40,0x0f]
+; CHECK-LABEL: test_mask_mullo_epi64_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x40,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <2 x i64>, <2 x i64>* %ptr_b
%res = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
ret <2 x i64> %res
}
define <2 x i64> @test_mask_mullo_epi64_rmkz_128(<2 x i64> %a, <2 x i64>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmkz_128
- ;CHECK: vpmullq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <2 x i64>, <2 x i64>* %ptr_b
%res = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
ret <2 x i64> %res
}
define <2 x i64> @test_mask_mullo_epi64_rmb_128(<2 x i64> %a, i64* %ptr_b) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmb_128
- ;CHECK: vpmullq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x18,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmullq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x18,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
%b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -209,8 +284,12 @@ define <2 x i64> @test_mask_mullo_epi64_rmb_128(<2 x i64> %a, i64* %ptr_b) {
}
define <2 x i64> @test_mask_mullo_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmbk_128
- ;CHECK: vpmullq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0x40,0x0f]
+; CHECK-LABEL: test_mask_mullo_epi64_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0x40,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
%b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -219,8 +298,11 @@ define <2 x i64> @test_mask_mullo_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 x
}
define <2 x i64> @test_mask_mullo_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mullo_epi64_rmbkz_128
- ;CHECK: vpmullq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x99,0x40,0x07]
+; CHECK-LABEL: test_mask_mullo_epi64_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vpmullq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x99,0x40,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
%b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -231,53 +313,73 @@ define <2 x i64> @test_mask_mullo_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8
declare <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
define <4 x float> @test_mask_andnot_ps_rr_128(<4 x float> %a, <4 x float> %b) {
- ;CHECK-LABEL: test_mask_andnot_ps_rr_128
- ;CHECK: vandnps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x55,0xc1]
+; CHECK-LABEL: test_mask_andnot_ps_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x55,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_mask_andnot_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rrk_128
- ;CHECK: vandnps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x55,0xd1]
+; CHECK-LABEL: test_mask_andnot_ps_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x55,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_andnot_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rrkz_128
- ;CHECK: vandnps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x55,0xc1]
+; CHECK-LABEL: test_mask_andnot_ps_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x55,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_andnot_ps_rm_128(<4 x float> %a, <4 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_ps_rm_128
- ;CHECK: vandnps (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandnps (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_mask_andnot_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmk_128
- ;CHECK: vandnps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x55,0x0f]
+; CHECK-LABEL: test_mask_andnot_ps_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x55,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_andnot_ps_rmkz_128(<4 x float> %a, <4 x float>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmkz_128
- ;CHECK: vandnps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_andnot_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmb_128
- ;CHECK: vandnps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandnps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -286,8 +388,12 @@ define <4 x float> @test_mask_andnot_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
}
define <4 x float> @test_mask_andnot_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmbk_128
- ;CHECK: vandnps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x55,0x0f]
+; CHECK-LABEL: test_mask_andnot_ps_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x55,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -296,8 +402,11 @@ define <4 x float> @test_mask_andnot_ps_rmbk_128(<4 x float> %a, float* %ptr_b,
}
define <4 x float> @test_mask_andnot_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmbkz_128
- ;CHECK: vandnps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -308,53 +417,73 @@ define <4 x float> @test_mask_andnot_ps_rmbkz_128(<4 x float> %a, float* %ptr_b,
declare <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <8 x float> @test_mask_andnot_ps_rr_256(<8 x float> %a, <8 x float> %b) {
- ;CHECK-LABEL: test_mask_andnot_ps_rr_256
- ;CHECK: vandnps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x55,0xc1]
+; CHECK-LABEL: test_mask_andnot_ps_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandnps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x55,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_mask_andnot_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rrk_256
- ;CHECK: vandnps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x55,0xd1]
+; CHECK-LABEL: test_mask_andnot_ps_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vandnps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x55,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_andnot_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rrkz_256
- ;CHECK: vandnps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x55,0xc1]
+; CHECK-LABEL: test_mask_andnot_ps_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vandnps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x55,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_andnot_ps_rm_256(<8 x float> %a, <8 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_ps_rm_256
- ;CHECK: vandnps (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandnps (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_mask_andnot_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmk_256
- ;CHECK: vandnps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x55,0x0f]
+; CHECK-LABEL: test_mask_andnot_ps_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x55,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_andnot_ps_rmkz_256(<8 x float> %a, <8 x float>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmkz_256
- ;CHECK: vandnps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_andnot_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmb_256
- ;CHECK: vandnps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandnps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -363,8 +492,12 @@ define <8 x float> @test_mask_andnot_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
}
define <8 x float> @test_mask_andnot_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmbk_256
- ;CHECK: vandnps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x55,0x0f]
+; CHECK-LABEL: test_mask_andnot_ps_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x55,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -373,8 +506,11 @@ define <8 x float> @test_mask_andnot_ps_rmbk_256(<8 x float> %a, float* %ptr_b,
}
define <8 x float> @test_mask_andnot_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmbkz_256
- ;CHECK: vandnps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -385,53 +521,73 @@ define <8 x float> @test_mask_andnot_ps_rmbkz_256(<8 x float> %a, float* %ptr_b,
declare <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <16 x float> @test_mask_andnot_ps_rr_512(<16 x float> %a, <16 x float> %b) {
- ;CHECK-LABEL: test_mask_andnot_ps_rr_512
- ;CHECK: vandnps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x55,0xc1]
+; CHECK-LABEL: test_mask_andnot_ps_rr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandnps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x55,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
ret <16 x float> %res
}
define <16 x float> @test_mask_andnot_ps_rrk_512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rrk_512
- ;CHECK: vandnps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x55,0xd1]
+; CHECK-LABEL: test_mask_andnot_ps_rrk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vandnps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x55,0xd1]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_andnot_ps_rrkz_512(<16 x float> %a, <16 x float> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rrkz_512
- ;CHECK: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x55,0xc1]
+; CHECK-LABEL: test_mask_andnot_ps_rrkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x55,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_andnot_ps_rm_512(<16 x float> %a, <16 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_ps_rm_512
- ;CHECK: vandnps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rm_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandnps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
ret <16 x float> %res
}
define <16 x float> @test_mask_andnot_ps_rmk_512(<16 x float> %a, <16 x float>* %ptr_b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmk_512
- ;CHECK: vandnps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x55,0x0f]
+; CHECK-LABEL: test_mask_andnot_ps_rmk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x55,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_andnot_ps_rmkz_512(<16 x float> %a, <16 x float>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmkz_512
- ;CHECK: vandnps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rmkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_andnot_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmb_512
- ;CHECK: vandnps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rmb_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandnps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -440,8 +596,12 @@ define <16 x float> @test_mask_andnot_ps_rmb_512(<16 x float> %a, float* %ptr_b)
}
define <16 x float> @test_mask_andnot_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmbk_512
- ;CHECK: vandnps (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x55,0x0f]
+; CHECK-LABEL: test_mask_andnot_ps_rmbk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x55,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -450,8 +610,11 @@ define <16 x float> @test_mask_andnot_ps_rmbk_512(<16 x float> %a, float* %ptr_b
}
define <16 x float> @test_mask_andnot_ps_rmbkz_512(<16 x float> %a, float* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_andnot_ps_rmbkz_512
- ;CHECK: vandnps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x55,0x07]
+; CHECK-LABEL: test_mask_andnot_ps_rmbkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vandnps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x55,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -462,53 +625,73 @@ define <16 x float> @test_mask_andnot_ps_rmbkz_512(<16 x float> %a, float* %ptr_
declare <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
define <4 x float> @test_mask_and_ps_rr_128(<4 x float> %a, <4 x float> %b) {
- ;CHECK-LABEL: test_mask_and_ps_rr_128
- ;CHECK: vandps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x54,0xc1]
+; CHECK-LABEL: test_mask_and_ps_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x54,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_mask_and_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rrk_128
- ;CHECK: vandps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x54,0xd1]
+; CHECK-LABEL: test_mask_and_ps_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x54,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_and_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rrkz_128
- ;CHECK: vandps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x54,0xc1]
+; CHECK-LABEL: test_mask_and_ps_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x54,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_and_ps_rm_128(<4 x float> %a, <4 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_ps_rm_128
- ;CHECK: vandps (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_mask_and_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmk_128
- ;CHECK: vandps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x54,0x0f]
+; CHECK-LABEL: test_mask_and_ps_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x54,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_and_ps_rmkz_128(<4 x float> %a, <4 x float>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmkz_128
- ;CHECK: vandps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_and_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_ps_rmb_128
- ;CHECK: vandps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -517,8 +700,12 @@ define <4 x float> @test_mask_and_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
}
define <4 x float> @test_mask_and_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmbk_128
- ;CHECK: vandps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x54,0x0f]
+; CHECK-LABEL: test_mask_and_ps_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x54,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -527,8 +714,11 @@ define <4 x float> @test_mask_and_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4
}
define <4 x float> @test_mask_and_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmbkz_128
- ;CHECK: vandps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -539,53 +729,73 @@ define <4 x float> @test_mask_and_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8
declare <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <8 x float> @test_mask_and_ps_rr_256(<8 x float> %a, <8 x float> %b) {
- ;CHECK-LABEL: test_mask_and_ps_rr_256
- ;CHECK: vandps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x54,0xc1]
+; CHECK-LABEL: test_mask_and_ps_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x54,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_mask_and_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rrk_256
- ;CHECK: vandps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x54,0xd1]
+; CHECK-LABEL: test_mask_and_ps_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x54,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_and_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rrkz_256
- ;CHECK: vandps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x54,0xc1]
+; CHECK-LABEL: test_mask_and_ps_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x54,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_and_ps_rm_256(<8 x float> %a, <8 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_ps_rm_256
- ;CHECK: vandps (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_mask_and_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmk_256
- ;CHECK: vandps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x54,0x0f]
+; CHECK-LABEL: test_mask_and_ps_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x54,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_and_ps_rmkz_256(<8 x float> %a, <8 x float>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmkz_256
- ;CHECK: vandps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_and_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_ps_rmb_256
- ;CHECK: vandps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -594,8 +804,12 @@ define <8 x float> @test_mask_and_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
}
define <8 x float> @test_mask_and_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmbk_256
- ;CHECK: vandps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x54,0x0f]
+; CHECK-LABEL: test_mask_and_ps_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x54,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -604,8 +818,11 @@ define <8 x float> @test_mask_and_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8
}
define <8 x float> @test_mask_and_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmbkz_256
- ;CHECK: vandps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -616,53 +833,73 @@ define <8 x float> @test_mask_and_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8
declare <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <16 x float> @test_mask_and_ps_rr_512(<16 x float> %a, <16 x float> %b) {
- ;CHECK-LABEL: test_mask_and_ps_rr_512
- ;CHECK: vandps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x54,0xc1]
+; CHECK-LABEL: test_mask_and_ps_rr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x54,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
ret <16 x float> %res
}
define <16 x float> @test_mask_and_ps_rrk_512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rrk_512
- ;CHECK: vandps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x54,0xd1]
+; CHECK-LABEL: test_mask_and_ps_rrk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x54,0xd1]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_and_ps_rrkz_512(<16 x float> %a, <16 x float> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rrkz_512
- ;CHECK: vandps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x54,0xc1]
+; CHECK-LABEL: test_mask_and_ps_rrkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vandps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x54,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_and_ps_rm_512(<16 x float> %a, <16 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_ps_rm_512
- ;CHECK: vandps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rm_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
ret <16 x float> %res
}
define <16 x float> @test_mask_and_ps_rmk_512(<16 x float> %a, <16 x float>* %ptr_b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmk_512
- ;CHECK: vandps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x54,0x0f]
+; CHECK-LABEL: test_mask_and_ps_rmk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x54,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_and_ps_rmkz_512(<16 x float> %a, <16 x float>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmkz_512
- ;CHECK: vandps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rmkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_and_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_ps_rmb_512
- ;CHECK: vandps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rmb_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -671,8 +908,12 @@ define <16 x float> @test_mask_and_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
}
define <16 x float> @test_mask_and_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmbk_512
- ;CHECK: vandps (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x54,0x0f]
+; CHECK-LABEL: test_mask_and_ps_rmbk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x54,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -681,8 +922,11 @@ define <16 x float> @test_mask_and_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <
}
define <16 x float> @test_mask_and_ps_rmbkz_512(<16 x float> %a, float* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_and_ps_rmbkz_512
- ;CHECK: vandps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x54,0x07]
+; CHECK-LABEL: test_mask_and_ps_rmbkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vandps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x54,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -693,53 +937,73 @@ define <16 x float> @test_mask_and_ps_rmbkz_512(<16 x float> %a, float* %ptr_b,
declare <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
define <4 x float> @test_mask_or_ps_rr_128(<4 x float> %a, <4 x float> %b) {
- ;CHECK-LABEL: test_mask_or_ps_rr_128
- ;CHECK: vorps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x56,0xc1]
+; CHECK-LABEL: test_mask_or_ps_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x56,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_mask_or_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rrk_128
- ;CHECK: vorps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x56,0xd1]
+; CHECK-LABEL: test_mask_or_ps_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x56,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_or_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rrkz_128
- ;CHECK: vorps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x56,0xc1]
+; CHECK-LABEL: test_mask_or_ps_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x56,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_or_ps_rm_128(<4 x float> %a, <4 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_ps_rm_128
- ;CHECK: vorps (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vorps (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_mask_or_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmk_128
- ;CHECK: vorps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x56,0x0f]
+; CHECK-LABEL: test_mask_or_ps_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x56,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_or_ps_rmkz_128(<4 x float> %a, <4 x float>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmkz_128
- ;CHECK: vorps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_or_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_ps_rmb_128
- ;CHECK: vorps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vorps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -748,8 +1012,12 @@ define <4 x float> @test_mask_or_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
}
define <4 x float> @test_mask_or_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmbk_128
- ;CHECK: vorps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x56,0x0f]
+; CHECK-LABEL: test_mask_or_ps_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x56,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -758,8 +1026,11 @@ define <4 x float> @test_mask_or_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x
}
define <4 x float> @test_mask_or_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmbkz_128
- ;CHECK: vorps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -770,53 +1041,73 @@ define <4 x float> @test_mask_or_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8
declare <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <8 x float> @test_mask_or_ps_rr_256(<8 x float> %a, <8 x float> %b) {
- ;CHECK-LABEL: test_mask_or_ps_rr_256
- ;CHECK: vorps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x56,0xc1]
+; CHECK-LABEL: test_mask_or_ps_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x56,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_mask_or_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rrk_256
- ;CHECK: vorps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x56,0xd1]
+; CHECK-LABEL: test_mask_or_ps_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x56,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_or_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rrkz_256
- ;CHECK: vorps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x56,0xc1]
+; CHECK-LABEL: test_mask_or_ps_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x56,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_or_ps_rm_256(<8 x float> %a, <8 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_ps_rm_256
- ;CHECK: vorps (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vorps (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_mask_or_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmk_256
- ;CHECK: vorps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x56,0x0f]
+; CHECK-LABEL: test_mask_or_ps_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x56,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_or_ps_rmkz_256(<8 x float> %a, <8 x float>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmkz_256
- ;CHECK: vorps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_or_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_ps_rmb_256
- ;CHECK: vorps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vorps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -825,8 +1116,12 @@ define <8 x float> @test_mask_or_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
}
define <8 x float> @test_mask_or_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmbk_256
- ;CHECK: vorps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x56,0x0f]
+; CHECK-LABEL: test_mask_or_ps_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x56,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -835,8 +1130,11 @@ define <8 x float> @test_mask_or_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x
}
define <8 x float> @test_mask_or_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmbkz_256
- ;CHECK: vorps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -847,53 +1145,73 @@ define <8 x float> @test_mask_or_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8
declare <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <16 x float> @test_mask_or_ps_rr_512(<16 x float> %a, <16 x float> %b) {
- ;CHECK-LABEL: test_mask_or_ps_rr_512
- ;CHECK: vorps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x56,0xc1]
+; CHECK-LABEL: test_mask_or_ps_rr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vorps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x56,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
ret <16 x float> %res
}
define <16 x float> @test_mask_or_ps_rrk_512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rrk_512
- ;CHECK: vorps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x56,0xd1]
+; CHECK-LABEL: test_mask_or_ps_rrk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vorps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x56,0xd1]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_or_ps_rrkz_512(<16 x float> %a, <16 x float> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rrkz_512
- ;CHECK: vorps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x56,0xc1]
+; CHECK-LABEL: test_mask_or_ps_rrkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vorps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x56,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_or_ps_rm_512(<16 x float> %a, <16 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_ps_rm_512
- ;CHECK: vorps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rm_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vorps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
ret <16 x float> %res
}
define <16 x float> @test_mask_or_ps_rmk_512(<16 x float> %a, <16 x float>* %ptr_b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmk_512
- ;CHECK: vorps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x56,0x0f]
+; CHECK-LABEL: test_mask_or_ps_rmk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x56,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_or_ps_rmkz_512(<16 x float> %a, <16 x float>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmkz_512
- ;CHECK: vorps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rmkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_or_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_ps_rmb_512
- ;CHECK: vorps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rmb_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vorps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -902,8 +1220,12 @@ define <16 x float> @test_mask_or_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
}
define <16 x float> @test_mask_or_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmbk_512
- ;CHECK: vorps (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x56,0x0f]
+; CHECK-LABEL: test_mask_or_ps_rmbk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x56,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -912,8 +1234,11 @@ define <16 x float> @test_mask_or_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <1
}
define <16 x float> @test_mask_or_ps_rmbkz_512(<16 x float> %a, float* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_or_ps_rmbkz_512
- ;CHECK: vorps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x56,0x07]
+; CHECK-LABEL: test_mask_or_ps_rmbkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vorps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x56,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -924,53 +1249,73 @@ define <16 x float> @test_mask_or_ps_rmbkz_512(<16 x float> %a, float* %ptr_b, i
declare <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
define <4 x float> @test_mask_xor_ps_rr_128(<4 x float> %a, <4 x float> %b) {
- ;CHECK-LABEL: test_mask_xor_ps_rr_128
- ;CHECK: vxorps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x57,0xc1]
+; CHECK-LABEL: test_mask_xor_ps_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x57,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_mask_xor_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rrk_128
- ;CHECK: vxorps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x57,0xd1]
+; CHECK-LABEL: test_mask_xor_ps_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x57,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_xor_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rrkz_128
- ;CHECK: vxorps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x57,0xc1]
+; CHECK-LABEL: test_mask_xor_ps_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x57,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_xor_ps_rm_128(<4 x float> %a, <4 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_ps_rm_128
- ;CHECK: vxorps (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_mask_xor_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmk_128
- ;CHECK: vxorps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x57,0x0f]
+; CHECK-LABEL: test_mask_xor_ps_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x57,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_xor_ps_rmkz_128(<4 x float> %a, <4 x float>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmkz_128
- ;CHECK: vxorps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
%res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mask_xor_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_ps_rmb_128
- ;CHECK: vxorps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -979,8 +1324,12 @@ define <4 x float> @test_mask_xor_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
}
define <4 x float> @test_mask_xor_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmbk_128
- ;CHECK: vxorps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x57,0x0f]
+; CHECK-LABEL: test_mask_xor_ps_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x57,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -989,8 +1338,11 @@ define <4 x float> @test_mask_xor_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4
}
define <4 x float> @test_mask_xor_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmbkz_128
- ;CHECK: vxorps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <4 x float> undef, float %q, i32 0
%b = shufflevector <4 x float> %vecinit.i, <4 x float> undef, <4 x i32> zeroinitializer
@@ -1001,53 +1353,73 @@ define <4 x float> @test_mask_xor_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8
declare <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <8 x float> @test_mask_xor_ps_rr_256(<8 x float> %a, <8 x float> %b) {
- ;CHECK-LABEL: test_mask_xor_ps_rr_256
- ;CHECK: vxorps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x57,0xc1]
+; CHECK-LABEL: test_mask_xor_ps_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x57,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_mask_xor_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rrk_256
- ;CHECK: vxorps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x57,0xd1]
+; CHECK-LABEL: test_mask_xor_ps_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x57,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_xor_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rrkz_256
- ;CHECK: vxorps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x57,0xc1]
+; CHECK-LABEL: test_mask_xor_ps_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x57,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_xor_ps_rm_256(<8 x float> %a, <8 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_ps_rm_256
- ;CHECK: vxorps (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_mask_xor_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmk_256
- ;CHECK: vxorps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x57,0x0f]
+; CHECK-LABEL: test_mask_xor_ps_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x57,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_xor_ps_rmkz_256(<8 x float> %a, <8 x float>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmkz_256
- ;CHECK: vxorps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
%res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mask_xor_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_ps_rmb_256
- ;CHECK: vxorps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -1056,8 +1428,12 @@ define <8 x float> @test_mask_xor_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
}
define <8 x float> @test_mask_xor_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x float> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmbk_256
- ;CHECK: vxorps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x57,0x0f]
+; CHECK-LABEL: test_mask_xor_ps_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x57,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -1066,8 +1442,11 @@ define <8 x float> @test_mask_xor_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8
}
define <8 x float> @test_mask_xor_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmbkz_256
- ;CHECK: vxorps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <8 x float> undef, float %q, i32 0
%b = shufflevector <8 x float> %vecinit.i, <8 x float> undef, <8 x i32> zeroinitializer
@@ -1078,53 +1457,73 @@ define <8 x float> @test_mask_xor_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8
declare <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <16 x float> @test_mask_xor_ps_rr_512(<16 x float> %a, <16 x float> %b) {
- ;CHECK-LABEL: test_mask_xor_ps_rr_512
- ;CHECK: vxorps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x57,0xc1]
+; CHECK-LABEL: test_mask_xor_ps_rr_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x57,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
ret <16 x float> %res
}
define <16 x float> @test_mask_xor_ps_rrk_512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rrk_512
- ;CHECK: vxorps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x57,0xd1]
+; CHECK-LABEL: test_mask_xor_ps_rrk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vxorps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x57,0xd1]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_xor_ps_rrkz_512(<16 x float> %a, <16 x float> %b, i16 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rrkz_512
- ;CHECK: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x57,0xc1]
+; CHECK-LABEL: test_mask_xor_ps_rrkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x57,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_xor_ps_rm_512(<16 x float> %a, <16 x float>* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_ps_rm_512
- ;CHECK: vxorps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rm_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
ret <16 x float> %res
}
define <16 x float> @test_mask_xor_ps_rmk_512(<16 x float> %a, <16 x float>* %ptr_b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmk_512
- ;CHECK: vxorps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x57,0x0f]
+; CHECK-LABEL: test_mask_xor_ps_rmk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x57,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_xor_ps_rmkz_512(<16 x float> %a, <16 x float>* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmkz_512
- ;CHECK: vxorps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rmkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
%res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 %mask)
ret <16 x float> %res
}
define <16 x float> @test_mask_xor_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_ps_rmb_512
- ;CHECK: vxorps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x57,0x07]
+; CHECK-LABEL: test_mask_xor_ps_rmb_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -1133,8 +1532,12 @@ define <16 x float> @test_mask_xor_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
}
define <16 x float> @test_mask_xor_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <16 x float> %passThru, i16 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmbk_512
- ;CHECK: vxorps (%rdi){1to16}, %zmm0, %zmm1 {%k1}
+; CHECK-LABEL: test_mask_xor_ps_rmbk_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x57,0x0f]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -1143,8 +1546,11 @@ define <16 x float> @test_mask_xor_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <
}
define <16 x float> @test_mask_xor_ps_rmbkz_512(<16 x float> %a, float* %ptr_b, i16 %mask) {
- ;CHECK-LABEL: test_mask_xor_ps_rmbkz_512
- ;CHECK: vxorps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
+; CHECK-LABEL: test_mask_xor_ps_rmbkz_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vxorps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x57,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
%vecinit.i = insertelement <16 x float> undef, float %q, i32 0
%b = shufflevector <16 x float> %vecinit.i, <16 x float> undef, <16 x i32> zeroinitializer
@@ -1159,11 +1565,11 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double>, <2 x i64>, i8
define <2 x i64>@test_int_x86_avx512_mask_cvt_pd2qq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtpd2qq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtpd2qq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x7b,0xc8]
+; CHECK-NEXT: vcvtpd2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x7b,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -1175,11 +1581,11 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double>, <4 x i64>, i8
define <4 x i64>@test_int_x86_avx512_mask_cvt_pd2qq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtpd2qq %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtpd2qq %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2qq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x7b,0xc8]
+; CHECK-NEXT: vcvtpd2qq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x7b,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -1191,11 +1597,11 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double>, <2 x i64>, i
define <2 x i64>@test_int_x86_avx512_mask_cvt_pd2uqq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtpd2uqq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtpd2uqq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x79,0xc8]
+; CHECK-NEXT: vcvtpd2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x79,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -1207,11 +1613,11 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double>, <4 x i64>, i
define <4 x i64>@test_int_x86_avx512_mask_cvt_pd2uqq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtpd2uqq %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtpd2uqq %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2uqq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x79,0xc8]
+; CHECK-NEXT: vcvtpd2uqq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x79,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -1223,11 +1629,11 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_cvt_ps2qq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtps2qq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtps2qq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtps2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x7b,0xc8]
+; CHECK-NEXT: vcvtps2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x7b,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -1239,11 +1645,11 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_cvt_ps2qq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtps2qq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtps2qq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtps2qq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x7b,0xc8]
+; CHECK-NEXT: vcvtps2qq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x7b,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -1255,11 +1661,11 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float>, <2 x i64>, i8
define <2 x i64>@test_int_x86_avx512_mask_cvt_ps2uqq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtps2uqq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtps2uqq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtps2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x79,0xc8]
+; CHECK-NEXT: vcvtps2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x79,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -1271,11 +1677,11 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float>, <4 x i64>, i8
define <4 x i64>@test_int_x86_avx512_mask_cvt_ps2uqq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtps2uqq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtps2uqq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtps2uqq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x79,0xc8]
+; CHECK-NEXT: vcvtps2uqq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x79,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -1287,11 +1693,11 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64>, <2 x double>,
define <2 x double>@test_int_x86_avx512_mask_cvt_qq2pd_128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtqq2pd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtqq2pd %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtqq2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0xe6,0xc8]
+; CHECK-NEXT: vcvtqq2pd %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0xe6,0xc0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 %x2)
%res1 = call <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1303,11 +1709,11 @@ declare <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64>, <4 x double>,
define <4 x double>@test_int_x86_avx512_mask_cvt_qq2pd_256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtqq2pd %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtqq2pd %ymm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtqq2pd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0xe6,0xc8]
+; CHECK-NEXT: vcvtqq2pd %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfe,0x28,0xe6,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 %x2)
%res1 = call <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1319,11 +1725,11 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64>, <4 x float>, i
define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x5b,0xc8]
+; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x08,0x5b,0xc0]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2)
%res1 = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1335,11 +1741,11 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64>, <4 x float>, i
define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtqq2ps %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtqq2ps %ymm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtqq2ps %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x29,0x5b,0xc8]
+; CHECK-NEXT: vcvtqq2ps %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x28,0x5b,0xc0]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 %x2)
%res1 = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1351,11 +1757,11 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double>, <2 x i64>, i
define <2 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvttpd2qq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttpd2qq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvttpd2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x7a,0xc8]
+; CHECK-NEXT: vcvttpd2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x7a,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -1367,11 +1773,11 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double>, <4 x i64>, i
define <4 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvttpd2qq %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvttpd2qq %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvttpd2qq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x7a,0xc8]
+; CHECK-NEXT: vcvttpd2qq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x7a,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -1383,11 +1789,11 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double>, <2 x i64>,
define <2 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvttpd2uqq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttpd2uqq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvttpd2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x78,0xc8]
+; CHECK-NEXT: vcvttpd2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x78,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -1399,11 +1805,11 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double>, <4 x i64>,
define <4 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvttpd2uqq %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvttpd2uqq %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvttpd2uqq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x78,0xc8]
+; CHECK-NEXT: vcvttpd2uqq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x78,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -1415,11 +1821,11 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float>, <2 x i64>, i8
define <2 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvttps2qq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttps2qq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvttps2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x7a,0xc8]
+; CHECK-NEXT: vcvttps2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x7a,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -1431,11 +1837,11 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float>, <4 x i64>, i8
define <4 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvttps2qq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvttps2qq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvttps2qq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x7a,0xc8]
+; CHECK-NEXT: vcvttps2qq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x7a,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -1447,11 +1853,11 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64>, <2 x double>
define <2 x double>@test_int_x86_avx512_mask_cvt_uqq2pd_128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtuqq2pd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtuqq2pd %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtuqq2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x7a,0xc8]
+; CHECK-NEXT: vcvtuqq2pd %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x7a,0xc0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 %x2)
%res1 = call <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1463,11 +1869,11 @@ declare <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64>, <4 x double>
define <4 x double>@test_int_x86_avx512_mask_cvt_uqq2pd_256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtuqq2pd %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtuqq2pd %ymm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtuqq2pd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x7a,0xc8]
+; CHECK-NEXT: vcvtuqq2pd %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfe,0x28,0x7a,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 %x2)
%res1 = call <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1479,11 +1885,11 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8]
+; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x7a,0xc0]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2)
%res1 = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1495,11 +1901,11 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvtuqq2ps %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtuqq2ps %ymm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvtuqq2ps %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x7a,0xc8]
+; CHECK-NEXT: vcvtuqq2ps %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x28,0x7a,0xc0]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 %x2)
%res1 = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1511,11 +1917,11 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float>, <2 x i64>, i
define <2 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvttps2uqq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttps2uqq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvttps2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x78,0xc8]
+; CHECK-NEXT: vcvttps2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x78,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -1527,11 +1933,11 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float>, <4 x i64>, i
define <4 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vcvttps2uqq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvttps2uqq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vcvttps2uqq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x78,0xc8]
+; CHECK-NEXT: vcvttps2uqq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x78,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -1540,12 +1946,14 @@ define <4 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_256(<4 x float> %x0, <4 x
declare <2 x double> @llvm.x86.avx512.mask.reduce.pd.128(<2 x double>, i32, <2 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_pd_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vreducepd {{.*}}{%k1}
-; CHECK: vreducepd
define <2 x double>@test_int_x86_avx512_mask_reduce_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vreducepd $4, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x56,0xc8,0x04]
+; CHECK-NEXT: vreducepd $8, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x56,0xc0,0x08]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.reduce.pd.128(<2 x double> %x0, i32 4, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.reduce.pd.128(<2 x double> %x0, i32 8, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1554,12 +1962,14 @@ define <2 x double>@test_int_x86_avx512_mask_reduce_pd_128(<2 x double> %x0, <2
declare <4 x double> @llvm.x86.avx512.mask.reduce.pd.256(<4 x double>, i32, <4 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_pd_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vreducepd {{.*}}{%k1}
-; CHECK: vreducepd
define <4 x double>@test_int_x86_avx512_mask_reduce_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vreducepd $4, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x56,0xc8,0x04]
+; CHECK-NEXT: vreducepd $0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x56,0xc0,0x00]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.reduce.pd.256(<4 x double> %x0, i32 4, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.reduce.pd.256(<4 x double> %x0, i32 0, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1567,12 +1977,15 @@ define <4 x double>@test_int_x86_avx512_mask_reduce_pd_256(<4 x double> %x0, <4
}
declare <4 x float> @llvm.x86.avx512.mask.reduce.ps.128(<4 x float>, i32, <4 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_ps_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vreduceps {{.*}}{%k1}
-; CHECK: vreduceps
+
define <4 x float>@test_int_x86_avx512_mask_reduce_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vreduceps $4, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x56,0xc8,0x04]
+; CHECK-NEXT: vreduceps $88, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x56,0xc0,0x58]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.reduce.ps.128(<4 x float> %x0, i32 4, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.reduce.ps.128(<4 x float> %x0, i32 88, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1581,12 +1994,14 @@ define <4 x float>@test_int_x86_avx512_mask_reduce_ps_128(<4 x float> %x0, <4 x
declare <8 x float> @llvm.x86.avx512.mask.reduce.ps.256(<8 x float>, i32, <8 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_reduce_ps_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vreduceps {{.*}}{%k1}
-; CHECK: vreduceps
define <8 x float>@test_int_x86_avx512_mask_reduce_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vreduceps $11, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x56,0xc8,0x0b]
+; CHECK-NEXT: vreduceps $11, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x56,0xc0,0x0b]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.reduce.ps.256(<8 x float> %x0, i32 11, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.reduce.ps.256(<8 x float> %x0, i32 11, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -1595,12 +2010,14 @@ define <8 x float>@test_int_x86_avx512_mask_reduce_ps_256(<8 x float> %x0, <8 x
declare <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double>, <2 x double>, i32, <2 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_range_pd_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrangepd {{.*}}{%k1}
-; CHECK: vrangepd
define <2 x double>@test_int_x86_avx512_mask_range_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_range_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vrangepd $4, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x50,0xd1,0x04]
+; CHECK-NEXT: vrangepd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x50,0xc1,0x08]
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double> %x0, <2 x double> %x1, i32 4, <2 x double> %x3, i8 %x4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double> %x0, <2 x double> %x1, i32 8, <2 x double> %x3, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -1609,12 +2026,14 @@ define <2 x double>@test_int_x86_avx512_mask_range_pd_128(<2 x double> %x0, <2 x
declare <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double>, <4 x double>, i32, <4 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_range_pd_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrangepd {{.*}}{%k1}
-; CHECK: vrangepd
define <4 x double>@test_int_x86_avx512_mask_range_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_range_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vrangepd $4, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x50,0xd1,0x04]
+; CHECK-NEXT: vrangepd $88, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x50,0xc1,0x58]
+; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double> %x0, <4 x double> %x1, i32 4, <4 x double> %x3, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double> %x0, <4 x double> %x1, i32 88, <4 x double> %x3, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -1623,12 +2042,14 @@ define <4 x double>@test_int_x86_avx512_mask_range_pd_256(<4 x double> %x0, <4 x
declare <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float>, <4 x float>, i32, <4 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_range_ps_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrangeps {{.*}}{%k1}
-; CHECK: vrangeps
define <4 x float>@test_int_x86_avx512_mask_range_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_range_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vrangeps $4, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x50,0xd1,0x04]
+; CHECK-NEXT: vrangeps $88, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x50,0xc1,0x58]
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6c,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float> %x0, <4 x float> %x1, i32 4, <4 x float> %x3, i8 %x4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float> %x0, <4 x float> %x1, i32 88, <4 x float> %x3, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -1637,12 +2058,14 @@ define <4 x float>@test_int_x86_avx512_mask_range_ps_128(<4 x float> %x0, <4 x f
declare <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float>, <8 x float>, i32, <8 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_range_ps_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrangeps {{.*}}{%k1}
-; CHECK: vrangeps
define <8 x float>@test_int_x86_avx512_mask_range_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_range_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vrangeps $4, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x50,0xd1,0x04]
+; CHECK-NEXT: vrangeps $88, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x50,0xc1,0x58]
+; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float> %x0, <8 x float> %x1, i32 4, <8 x float> %x3, i8 %x4)
%res1 = call <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float> %x0, <8 x float> %x1, i32 88, <8 x float> %x3, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -1654,13 +2077,13 @@ declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double>, i32,
define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01]
+; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc2,0x01]
+; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x19,0xc0,0x01]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x58,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3)
%res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1)
@@ -1674,13 +2097,13 @@ declare <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double>, <2 x do
define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, <4 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01]
+; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xd9,0x01]
+; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x18,0xc1,0x01]
+; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1)
%res2 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> zeroinitializer, i8 %x4)
@@ -1694,13 +2117,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64>, <2 x i64>, i3
define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01]
+; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xd9,0x01]
+; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x38,0xc1,0x01]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xd4,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> zeroinitializer, i8 %x4)
@@ -1711,14 +2134,17 @@ define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i6
declare i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float>, i32, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_ps_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vfpclassps
-; CHECK: {%k1}
-; CHECK: vfpclassps
-; CHECK: kmovb %k0
define i8 @test_int_x86_avx512_mask_fpclass_ps_128(<4 x float> %x0, i8 %x1) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vfpclassps $2, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x66,0xc0,0x02]
+; CHECK-NEXT: kmovb %k0, %ecx ## encoding: [0xc5,0xf9,0x93,0xc8]
+; CHECK-NEXT: vfpclassps $4, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x66,0xc0,0x04]
+; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float> %x0, i32 2, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float> %x0, i32 4, i8 -1)
%res2 = add i8 %res, %res1
@@ -1727,14 +2153,17 @@ define i8 @test_int_x86_avx512_mask_fpclass_ps_128(<4 x float> %x0, i8 %x1) {
declare i8 @llvm.x86.avx512.mask.fpclass.ps.256(<8 x float>, i32, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_ps_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vfpclassps
-; CHECK: {%k1}
-; CHECK: vfpclassps
-; CHECK: kmovb %k0
define i8 @test_int_x86_avx512_mask_fpclass_ps_256(<8 x float> %x0, i8 %x1) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vfpclassps $2, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x66,0xc0,0x02]
+; CHECK-NEXT: kmovb %k0, %ecx ## encoding: [0xc5,0xf9,0x93,0xc8]
+; CHECK-NEXT: vfpclassps $4, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x66,0xc0,0x04]
+; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.fpclass.ps.256(<8 x float> %x0, i32 2, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.ps.256(<8 x float> %x0, i32 4, i8 -1)
%res2 = add i8 %res, %res1
@@ -1743,14 +2172,17 @@ define i8 @test_int_x86_avx512_mask_fpclass_ps_256(<8 x float> %x0, i8 %x1) {
declare i8 @llvm.x86.avx512.mask.fpclass.pd.128(<2 x double>, i32, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_pd_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vfpclasspd
-; CHECK: {%k1}
-; CHECK: vfpclasspd
-; CHECK: kmovb %k0
define i8 @test_int_x86_avx512_mask_fpclass_pd_128(<2 x double> %x0, i8 %x1) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vfpclasspd $4, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x66,0xc0,0x04]
+; CHECK-NEXT: kmovb %k0, %ecx ## encoding: [0xc5,0xf9,0x93,0xc8]
+; CHECK-NEXT: vfpclasspd $2, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x66,0xc0,0x02]
+; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.fpclass.pd.128(<2 x double> %x0, i32 4, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.128(<2 x double> %x0, i32 2, i8 -1)
%res2 = add i8 %res, %res1
@@ -1759,14 +2191,17 @@ define i8 @test_int_x86_avx512_mask_fpclass_pd_128(<2 x double> %x0, i8 %x1) {
declare i8 @llvm.x86.avx512.mask.fpclass.pd.256(<4 x double>, i32, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_fpclass_pd_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vfpclasspd
-; CHECK: {%k1}
-; CHECK: vfpclasspd
-; CHECK: kmovb %k0
define i8 @test_int_x86_avx512_mask_fpclass_pd_256(<4 x double> %x0, i8 %x1) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vfpclasspd $2, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x66,0xc0,0x02]
+; CHECK-NEXT: kmovb %k0, %ecx ## encoding: [0xc5,0xf9,0x93,0xc8]
+; CHECK-NEXT: vfpclasspd $4, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x66,0xc0,0x04]
+; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.fpclass.pd.256(<4 x double> %x0, i32 2, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.256(<4 x double> %x0, i32 4, i8 -1)
%res2 = add i8 %res, %res1
@@ -1778,13 +2213,13 @@ declare <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float>, <8 x f
define <8 x float>@test_int_x86_avx512_mask_broadcastf32x2_256(<4 x float> %x0, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x2_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm0
-; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x19,0xc8]
+; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x19,0xd0]
+; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x19,0xc0]
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xca]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> zeroinitializer, i8 %x3)
%res2 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 -1)
@@ -1795,17 +2230,20 @@ define <8 x float>@test_int_x86_avx512_mask_broadcastf32x2_256(<4 x float> %x0,
declare <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32>, <8 x i32>, i8)
-define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x2_256(<4 x i32> %x0, <8 x i32> %x2, i8 %x3) {
+define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x2_256(<4 x i32> %x0, <8 x i32> %x2, i8 %x3, i64 * %y_ptr) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x2_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vbroadcasti32x2 %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vbroadcasti32x2 %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vbroadcasti32x2 %xmm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %x0, <8 x i32> %x2, i8 %x3)
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vbroadcasti32x2 (%rsi), %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x59,0x0e]
+; CHECK-NEXT: vbroadcasti32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x59,0xd0]
+; CHECK-NEXT: vbroadcasti32x2 %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x59,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %y_64 = load i64, i64 * %y_ptr
+ %y_v2i64 = insertelement <2 x i64> undef, i64 %y_64, i32 0
+ %y = bitcast <2 x i64> %y_v2i64 to <4 x i32>
+ %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %y, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %x3)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %x0, <8 x i32> %x2, i8 -1)
%res3 = add <8 x i32> %res, %res1
@@ -1818,13 +2256,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32>, <4 x i32>,
define <4 x i32>@test_int_x86_avx512_mask_broadcasti32x2_128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x2_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x59,0xc8]
+; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x59,0xd0]
+; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x59,0xc0]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xca]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %x3)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 -1)
@@ -1838,9 +2276,10 @@ declare i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32>)
define i8@test_int_x86_avx512_cvtd2mask_128(<4 x i32> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovd2m %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpmovd2m %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x39,0xc0]
+; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32> %x0)
ret i8 %res
}
@@ -1850,9 +2289,10 @@ declare i8 @llvm.x86.avx512.cvtd2mask.256(<8 x i32>)
define i8@test_int_x86_avx512_cvtd2mask_256(<8 x i32> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovd2m %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpmovd2m %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x39,0xc0]
+; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtd2mask.256(<8 x i32> %x0)
ret i8 %res
}
@@ -1862,9 +2302,10 @@ declare i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64>)
define i8@test_int_x86_avx512_cvtq2mask_128(<2 x i64> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovq2m %xmm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpmovq2m %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x39,0xc0]
+; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64> %x0)
ret i8 %res
}
@@ -1874,9 +2315,10 @@ declare i8 @llvm.x86.avx512.cvtq2mask.256(<4 x i64>)
define i8@test_int_x86_avx512_cvtq2mask_256(<4 x i64> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovq2m %ymm0, %k0
-; CHECK-NEXT: kmovb %k0, %eax
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpmovq2m %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x39,0xc0]
+; CHECK-NEXT: kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtq2mask.256(<4 x i64> %x0)
ret i8 %res
}
@@ -1886,9 +2328,9 @@ declare <4 x i32> @llvm.x86.avx512.cvtmask2d.128(i8)
define <4 x i32>@test_int_x86_avx512_cvtmask2d_128(i8 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k0
-; CHECK-NEXT: vpmovm2d %k0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k0 ## encoding: [0xc5,0xf9,0x92,0xc7]
+; CHECK-NEXT: vpmovm2d %k0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.cvtmask2d.128(i8 %x0)
ret <4 x i32> %res
}
@@ -1898,9 +2340,9 @@ declare <8 x i32> @llvm.x86.avx512.cvtmask2d.256(i8)
define <8 x i32>@test_int_x86_avx512_cvtmask2d_256(i8 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k0
-; CHECK-NEXT: vpmovm2d %k0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k0 ## encoding: [0xc5,0xf9,0x92,0xc7]
+; CHECK-NEXT: vpmovm2d %k0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x38,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.cvtmask2d.256(i8 %x0)
ret <8 x i32> %res
}
@@ -1910,9 +2352,9 @@ declare <2 x i64> @llvm.x86.avx512.cvtmask2q.128(i8)
define <2 x i64>@test_int_x86_avx512_cvtmask2q_128(i8 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k0
-; CHECK-NEXT: vpmovm2q %k0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k0 ## encoding: [0xc5,0xf9,0x92,0xc7]
+; CHECK-NEXT: vpmovm2q %k0, %xmm0 ## encoding: [0x62,0xf2,0xfe,0x08,0x38,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.cvtmask2q.128(i8 %x0)
ret <2 x i64> %res
}
@@ -1922,9 +2364,9 @@ declare <4 x i64> @llvm.x86.avx512.cvtmask2q.256(i8)
define <4 x i64>@test_int_x86_avx512_cvtmask2q_256(i8 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k0
-; CHECK-NEXT: vpmovm2q %k0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovb %edi, %k0 ## encoding: [0xc5,0xf9,0x92,0xc7]
+; CHECK-NEXT: vpmovm2q %k0, %ymm0 ## encoding: [0x62,0xf2,0xfe,0x28,0x38,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.cvtmask2q.256(i8 %x0)
ret <4 x i64> %res
}
@@ -1932,12 +2374,18 @@ declare <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double>, <4 x
define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256(<2 x double> %x0, <4 x double> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_256:
-; CHECK: kmovb %edi, %k1
-; CHECK: vshuff64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z}
-; CHECK: vshuff64x2 $0, %ymm0, %ymm0, %ymm1 {%k1}
-; CHECK: vshuff64x2 $0, %ymm0, %ymm0, %ymm0
-; CHECK: vaddpd %ymm1, %ymm0, %ymm0
-; CHECK: vaddpd %ymm0, %ymm2, %ymm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vshuff64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x23,0xd0,0x00]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,0,1]
+; CHECK-NEXT: vshuff64x2 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x23,0xc8,0x00]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,0,1]
+; CHECK-NEXT: vshuff64x2 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x23,0xc0,0x00]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1,0,1]
+; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x58,0xc1]
+; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res1 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 -1)
%res2 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 %mask)
@@ -1951,12 +2399,18 @@ declare <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64>, <4 x i64>,
define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_256:
-; CHECK: kmovb %edi, %k1
-; CHECK: vshufi64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z}
-; CHECK: vshufi64x2 $0, %ymm0, %ymm0, %ymm1 {%k1}
-; CHECK: vshufi64x2 $0, %ymm0, %ymm0, %ymm0
-; CHECK: vpaddq %ymm1, %ymm0, %ymm0
-; CHECK: vpaddq %ymm0, %ymm2, %ymm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vshufi64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x43,0xd0,0x00]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,0,1]
+; CHECK-NEXT: vshufi64x2 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x43,0xc8,0x00]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,0,1]
+; CHECK-NEXT: vshufi64x2 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x43,0xc0,0x00]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1,0,1]
+; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xd4,0xc1]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res1 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 -1)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask)
diff --git a/test/CodeGen/X86/avx512ifma-intrinsics.ll b/test/CodeGen/X86/avx512ifma-intrinsics.ll
new file mode 100644
index 000000000000..685817cbe265
--- /dev/null
+++ b/test/CodeGen/X86/avx512ifma-intrinsics.ll
@@ -0,0 +1,105 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512ifma | FileCheck %s
+
+declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512:
+; CHECK: kmovw %edi, %k1
+; CHECK: vmovaps %zmm0, %zmm3
+; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK: vmovaps %zmm0, %zmm4
+; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm4
+; CHECK: vpxord %zmm2, %zmm2, %zmm2
+; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK: vpaddq %zmm0, %zmm3, %zmm0
+; CHECK: vpaddq %zmm2, %zmm4, %zmm1
+; CHECK: vpaddq %zmm0, %zmm1, %zmm0
+
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res4 = add <8 x i64> %res, %res1
+ %res5 = add <8 x i64> %res3, %res2
+ %res6 = add <8 x i64> %res5, %res4
+ ret <8 x i64> %res6
+}
+
+declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512:
+; CHECK: kmovw %edi, %k1
+; CHECK: vmovaps %zmm0, %zmm3
+; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm3 {%k1} {z}
+; CHECK: vmovaps %zmm0, %zmm4
+; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm4
+; CHECK: vpxord %zmm2, %zmm2, %zmm2
+; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK: vpaddq %zmm0, %zmm3, %zmm0
+; CHECK: vpaddq %zmm2, %zmm4, %zmm1
+; CHECK: vpaddq %zmm0, %zmm1, %zmm0
+
+ %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res4 = add <8 x i64> %res, %res1
+ %res5 = add <8 x i64> %res3, %res2
+ %res6 = add <8 x i64> %res5, %res4
+ ret <8 x i64> %res6
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_512:
+; CHECK: kmovw %edi, %k1
+; CHECK: vmovaps %zmm0, %zmm3
+; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm3 {%k1}
+; CHECK: vmovaps %zmm0, %zmm4
+; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm4
+; CHECK: vpxord %zmm2, %zmm2, %zmm2
+; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK: vpaddq %zmm0, %zmm3, %zmm0
+; CHECK: vpaddq %zmm2, %zmm4, %zmm1
+; CHECK: vpaddq %zmm0, %zmm1, %zmm0
+
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res4 = add <8 x i64> %res, %res1
+ %res5 = add <8 x i64> %res3, %res2
+ %res6 = add <8 x i64> %res5, %res4
+ ret <8 x i64> %res6
+}
+
+declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_512:
+; CHECK: kmovw %edi, %k1
+; CHECK: vmovaps %zmm0, %zmm3
+; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm3 {%k1} {z}
+; CHECK: vmovaps %zmm0, %zmm4
+; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm4
+; CHECK: vpxord %zmm2, %zmm2, %zmm2
+; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK: vpaddq %zmm0, %zmm3, %zmm0
+; CHECK: vpaddq %zmm2, %zmm4, %zmm1
+; CHECK: vpaddq %zmm0, %zmm1, %zmm0
+
+ %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ %res4 = add <8 x i64> %res, %res1
+ %res5 = add <8 x i64> %res3, %res2
+ %res6 = add <8 x i64> %res5, %res4
+ ret <8 x i64> %res6
+}
diff --git a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
new file mode 100644
index 000000000000..1d5febfec4f3
--- /dev/null
+++ b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
@@ -0,0 +1,226 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl -mattr=+avx512ifma | FileCheck %s
+
+declare <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm3
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %xmm0, %xmm4
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm4, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+
+ %res = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res4 = add <2 x i64> %res, %res1
+ %res5 = add <2 x i64> %res3, %res2
+ %res6 = add <2 x i64> %res5, %res4
+ ret <2 x i64> %res6
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %ymm0, %ymm3
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vmovaps %ymm0, %ymm4
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm4, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+
+ %res = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res4 = add <4 x i64> %res, %res1
+ %res5 = add <4 x i64> %res3, %res2
+ %res6 = add <4 x i64> %res5, %res4
+ ret <4 x i64> %res6
+}
+
+declare <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm3
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm3 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm0, %xmm4
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm4, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+
+ %res = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res4 = add <2 x i64> %res, %res1
+ %res5 = add <2 x i64> %res3, %res2
+ %res6 = add <2 x i64> %res5, %res4
+ ret <2 x i64> %res6
+}
+
+declare <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %ymm0, %ymm3
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm3 {%k1} {z}
+; CHECK-NEXT: vmovaps %ymm0, %ymm4
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm4, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+
+ %res = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res4 = add <4 x i64> %res, %res1
+ %res5 = add <4 x i64> %res3, %res2
+ %res6 = add <4 x i64> %res5, %res4
+ ret <4 x i64> %res6
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm3
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %xmm0, %xmm4
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm4, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+
+ %res = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res4 = add <2 x i64> %res, %res1
+ %res5 = add <2 x i64> %res3, %res2
+ %res6 = add <2 x i64> %res5, %res4
+ ret <2 x i64> %res6
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %ymm0, %ymm3
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vmovaps %ymm0, %ymm4
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm4, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+
+ %res = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res4 = add <4 x i64> %res, %res1
+ %res5 = add <4 x i64> %res3, %res2
+ %res6 = add <4 x i64> %res5, %res4
+ ret <4 x i64> %res6
+}
+
+declare <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm3
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm3 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm0, %xmm4
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vpaddq %xmm2, %xmm4, %xmm1
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+
+ %res = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res4 = add <2 x i64> %res, %res1
+ %res5 = add <2 x i64> %res3, %res2
+ %res6 = add <2 x i64> %res5, %res4
+ ret <2 x i64> %res6
+}
+
+declare <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %ymm0, %ymm3
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm3 {%k1} {z}
+; CHECK-NEXT: vmovaps %ymm0, %ymm4
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: vpaddq %ymm2, %ymm4, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+
+ %res = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res3 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res4 = add <4 x i64> %res, %res1
+ %res5 = add <4 x i64> %res3, %res2
+ %res6 = add <4 x i64> %res5, %res4
+ ret <4 x i64> %res6
+}
diff --git a/test/CodeGen/X86/avx512vbmi-intrinsics.ll b/test/CodeGen/X86/avx512vbmi-intrinsics.ll
new file mode 100644
index 000000000000..ce999855d1f1
--- /dev/null
+++ b/test/CodeGen/X86/avx512vbmi-intrinsics.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -mattr=+avx512vbmi | FileCheck %s
+declare <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_permvar_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_qi_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpermb %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT: vpermb %zmm0, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddb %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> zeroinitializer, i64 %x3)
+ %res2 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
+ %res3 = add <64 x i8> %res, %res1
+ %res4 = add <64 x i8> %res3, %res2
+ ret <64 x i8> %res4
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_pmultishift_qb_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmultishift_qb_512:
+; CHECK: vpmultishiftqb %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK: vpmultishiftqb %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK: vpmultishiftqb %zmm1, %zmm0, %zmm0
+; CHECK: vpaddb %zmm3, %zmm2, %zmm1
+; CHECK: vpaddb %zmm0, %zmm1, %zmm0
+ %res = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> zeroinitializer, i64 %x3)
+ %res2 = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
+ %res3 = add <64 x i8> %res, %res1
+ %res4 = add <64 x i8> %res3, %res2
+ ret <64 x i8> %res4
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1
+; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z}
+; CHECK-NEXT: vpaddb %zmm4, %zmm3, %zmm0
+; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> zeroinitializer, <64 x i8> %x2, i64 %x3)
+ %res2 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
+ %res3 = add <64 x i8> %res, %res1
+ %res4 = add <64 x i8> %res3, %res2
+ ret <64 x i8> %res4
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1
+; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z}
+; CHECK-NEXT: vpaddb %zmm4, %zmm3, %zmm0
+; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> zeroinitializer, <64 x i8> %x2, i64 %x3)
+ %res2 = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
+ %res3 = add <64 x i8> %res, %res1
+ %res4 = add <64 x i8> %res3, %res2
+ ret <64 x i8> %res4
+}
+
+declare <64 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <64 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
+ ret <64 x i8> %res
+}
diff --git a/test/CodeGen/X86/avx512vbmivl-intrinsics.ll b/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
new file mode 100644
index 000000000000..b68e71110210
--- /dev/null
+++ b/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
@@ -0,0 +1,195 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -mattr=+avx512vl -mattr=+avx512vbmi --show-mc-encoding| FileCheck %s
+declare <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_permvar_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_qi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermb %xmm0, %xmm1, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0x8d,0xd0]
+; CHECK-NEXT: vpermb %xmm0, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0x8d,0xd8]
+; CHECK-NEXT: vpermb %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0x8d,0xc0]
+; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xcb]
+; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
+ %res3 = add <16 x i8> %res, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_permvar_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_qi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpermb %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0x8d,0xd0]
+; CHECK-NEXT: vpermb %ymm0, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0x8d,0xd8]
+; CHECK-NEXT: vpermb %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0x8d,0xc0]
+; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xcb]
+; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
+ %res3 = add <32 x i8> %res, %res1
+ %res4 = add <32 x i8> %res3, %res2
+ ret <32 x i8> %res4
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmultishift_qb_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmultishift_qb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x83,0xd1]
+; CHECK-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x83,0xd9]
+; CHECK-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x83,0xc1]
+; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfc,0xcb]
+; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
+ %res3 = add <16 x i8> %res, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_pmultishift_qb_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmultishift_qb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x83,0xd1]
+; CHECK-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xd9]
+; CHECK-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x83,0xc1]
+; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfc,0xcb]
+; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> zeroinitializer, i32 %x3)
+ %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
+ %res3 = add <32 x i8> %res, %res1
+ %res4 = add <32 x i8> %res3, %res2
+ ret <32 x i8> %res4
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd9]
+; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7d,0xda]
+; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7d,0xca]
+; CHECK-NEXT: vpxord %xmm4, %xmm4, %xmm4 ## encoding: [0x62,0xf1,0x5d,0x08,0xef,0xe4]
+; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7d,0xe2]
+; CHECK-NEXT: vpaddb %xmm4, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfc,0xc4]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> zeroinitializer, <16 x i8> %x2, i16 %x3)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
+ %res3 = add <16 x i8> %res, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd9]
+; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7d,0xda]
+; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7d,0xca]
+; CHECK-NEXT: vpxord %ymm4, %ymm4, %ymm4 ## encoding: [0x62,0xf1,0x5d,0x28,0xef,0xe4]
+; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7d,0xe2]
+; CHECK-NEXT: vpaddb %ymm4, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfc,0xc4]
+; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> zeroinitializer, <32 x i8> %x2, i32 %x3)
+ %res2 = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
+ %res3 = add <32 x i8> %res, %res1
+ %res4 = add <32 x i8> %res3, %res2
+ ret <32 x i8> %res4
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd9]
+; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7d,0xda]
+; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7d,0xca]
+; CHECK-NEXT: vpxord %xmm4, %xmm4, %xmm4 ## encoding: [0x62,0xf1,0x5d,0x08,0xef,0xe4]
+; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7d,0xe2]
+; CHECK-NEXT: vpaddb %xmm4, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfc,0xc4]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> zeroinitializer, <16 x i8> %x2, i16 %x3)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
+ %res3 = add <16 x i8> %res, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd9]
+; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7d,0xda]
+; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7d,0xca]
+; CHECK-NEXT: vpxord %ymm4, %ymm4, %ymm4 ## encoding: [0x62,0xf1,0x5d,0x28,0xef,0xe4]
+; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7d,0xe2]
+; CHECK-NEXT: vpaddb %ymm4, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfc,0xc4]
+; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfc,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> zeroinitializer, <32 x i8> %x2, i32 %x3)
+ %res2 = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
+ %res3 = add <32 x i8> %res, %res1
+ %res4 = add <32 x i8> %res3, %res2
+ ret <32 x i8> %res4
+}
+
+declare <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7d,0xca]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
+ ret <16 x i8> %res
+}
+
+declare <32 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7d,0xca]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
+ ret <32 x i8> %res
+}
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..310ed8f50c4e
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
@@ -0,0 +1,1391 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c
+
+define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastd %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res1 = bitcast <4 x i32> %res0 to <2 x i64>
+ ret <2 x i64> %res1
+}
+
+define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm_mask_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp0:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res1 = select <4 x i1> %arg1, <4 x i32> %res0, <4 x i32> %arg0
+ %res2 = bitcast <4 x i32> %res1 to <2 x i64>
+ ret <2 x i64> %res2
+}
+
+define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_maskz_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp1:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn0 to <4 x i1>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res1 = select <4 x i1> %arg0, <4 x i32> %res0, <4 x i32> zeroinitializer
+ %res2 = bitcast <4 x i32> %res1 to <2 x i64>
+ ret <2 x i64> %res2
+}
+
+define <4 x i64> @test_mm256_broadcastd_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastd %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <8 x i32> zeroinitializer
+ %res1 = bitcast <8 x i32> %res0 to <4 x i64>
+ ret <4 x i64> %res1
+}
+
+define <4 x i64> @test_mm256_mask_broadcastd_epi32(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm256_mask_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1}
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg1, <8 x i32> %res0, <8 x i32> %arg0
+ %res2 = bitcast <8 x i32> %res1 to <4 x i64>
+ ret <4 x i64> %res2
+}
+
+define <4 x i64> @test_mm256_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_maskz_broadcastd_epi32:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_broadcastd_epi32:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg0, <8 x i32> %res0, <8 x i32> zeroinitializer
+ %res2 = bitcast <8 x i32> %res1 to <4 x i64>
+ ret <4 x i64> %res2
+}
+
+define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastq %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm_mask_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp2:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $3, %al
+; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: andb $3, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i2
+ %arg1 = bitcast i2 %trn1 to <2 x i1>
+ %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <2 x i32> zeroinitializer
+ %res1 = select <2 x i1> %arg1, <2 x i64> %res0, <2 x i64> %a0
+ ret <2 x i64> %res1
+}
+
+define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_maskz_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp3:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $3, %al
+; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: andb $3, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i2
+ %arg0 = bitcast i2 %trn0 to <2 x i1>
+ %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <2 x i32> zeroinitializer
+ %res1 = select <2 x i1> %arg0, <2 x i64> %res0, <2 x i64> zeroinitializer
+ ret <2 x i64> %res1
+}
+
+define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpbroadcastq %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpbroadcastq %xmm0, %ymm0
+; X64-NEXT: retq
+ %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm256_mask_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp4:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> zeroinitializer
+ %res1 = select <4 x i1> %arg1, <4 x i64> %res0, <4 x i64> %a0
+ ret <4 x i64> %res1
+}
+
+define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm256_maskz_broadcastq_epi64:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp5:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn0 to <4 x i1>
+ %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <4 x i32> zeroinitializer
+ %res1 = select <4 x i1> %arg0, <4 x i64> %res0, <4 x i64> zeroinitializer
+ ret <4 x i64> %res1
+}
+
+define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
+; X32-LABEL: test_mm_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) {
+; X32-LABEL: test_mm_mask_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp6:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $3, %al
+; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $3, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i2
+ %arg1 = bitcast i2 %trn1 to <2 x i1>
+ %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <2 x i32> zeroinitializer
+ %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0
+ ret <2 x double> %res1
+}
+
+define <2 x double> @test_mm_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_maskz_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp7:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $3, %al
+; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $3, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i2
+ %arg0 = bitcast i2 %trn0 to <2 x i1>
+ %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
+ %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer
+ ret <2 x double> %res1
+}
+
+define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) {
+; X32-LABEL: test_mm256_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastsd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastsd %xmm0, %ymm0
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2 x double> %a2) {
+; X32-LABEL: test_mm256_mask_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp8:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <4 x i32> zeroinitializer
+ %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
+ ret <4 x double> %res1
+}
+
+define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm256_maskz_broadcastsd_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp9:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn0 to <4 x i1>
+ %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <4 x i32> zeroinitializer
+ %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
+ ret <4 x double> %res1
+}
+
+define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastss %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
+; X32-LABEL: test_mm_mask_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp10:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> zeroinitializer
+ %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
+ ret <4 x float> %res1
+}
+
+define <4 x float> @test_mm_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_maskz_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp11:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn0 to <4 x i1>
+ %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
+ %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
+ ret <4 x float> %res1
+}
+
+define <8 x float> @test_mm256_broadcastss_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm256_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: vbroadcastss %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: vbroadcastss %xmm0, %ymm0
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm256_mask_broadcastss_ps(<8 x float> %a0, i8 %a1, <4 x float> %a2) {
+; X32-LABEL: test_mm256_mask_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
+ ret <8 x float> %res1
+}
+
+define <8 x float> @test_mm256_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm256_maskz_broadcastss_ps:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_broadcastss_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> zeroinitializer
+ %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
+ ret <8 x float> %res1
+}
+
+define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) {
+; X32-LABEL: test_mm_movddup_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_movddup_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) {
+; X32-LABEL: test_mm_mask_movddup_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp12:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $3, %al
+; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_movddup_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $3, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i2
+ %arg1 = bitcast i2 %trn1 to <2 x i1>
+ %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <2 x i32> zeroinitializer
+ %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0
+ ret <2 x double> %res1
+}
+
+define <2 x double> @test_mm_maskz_movddup_pd(i8 %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_maskz_movddup_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp13:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $3, %al
+; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_movddup_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $3, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a0 to i2
+ %arg0 = bitcast i2 %trn1 to <2 x i1>
+ %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
+ %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer
+ ret <2 x double> %res1
+}
+
+define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) {
+; X32-LABEL: test_mm256_movddup_pd:
+; X32: # BB#0:
+; X32-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_movddup_pd:
+; X64: # BB#0:
+; X64-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) {
+; X32-LABEL: test_mm256_mask_movddup_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp14:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_movddup_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x double> %a2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
+ ret <4 x double> %res1
+}
+
+define <4 x double> @test_mm256_maskz_movddup_pd(i8 %a0, <4 x double> %a1) {
+; X32-LABEL: test_mm256_maskz_movddup_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp15:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_movddup_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
+ ret <4 x double> %res1
+}
+
+define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
+; X32-LABEL: test_mm_mask_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp16:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
+ ret <4 x float> %res1
+}
+
+define <4 x float> @test_mm_maskz_movehdup_ps(i8 %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_maskz_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp17:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn0 to <4 x i1>
+ %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
+ ret <4 x float> %res1
+}
+
+define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) {
+; X32-LABEL: test_mm256_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm256_mask_movehdup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
+; X32-LABEL: test_mm256_mask_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
+ ret <8 x float> %res1
+}
+
+define <8 x float> @test_mm256_maskz_movehdup_ps(i8 %a0, <8 x float> %a1) {
+; X32-LABEL: test_mm256_maskz_movehdup_ps:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_movehdup_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
+ ret <8 x float> %res1
+}
+
+define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
+; X32-LABEL: test_mm_mask_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp18:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
+ ret <4 x float> %res1
+}
+
+define <4 x float> @test_mm_maskz_moveldup_ps(i8 %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_maskz_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp19:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn0 to <4 x i1>
+ %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
+ ret <4 x float> %res1
+}
+
+define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) {
+; X32-LABEL: test_mm256_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm256_mask_moveldup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
+; X32-LABEL: test_mm256_mask_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
+ ret <8 x float> %res1
+}
+
+define <8 x float> @test_mm256_maskz_moveldup_ps(i8 %a0, <8 x float> %a1) {
+; X32-LABEL: test_mm256_maskz_moveldup_ps:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_moveldup_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
+ ret <8 x float> %res1
+}
+
+define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) {
+; X32-LABEL: test_mm256_permutex_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permutex_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; X64-NEXT: retq
+ %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64> %a2) {
+; X32-LABEL: test_mm256_mask_permutex_epi64:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp20:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_permutex_epi64:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x i64> %a2, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+ %res1 = select <4 x i1> %arg1, <4 x i64> %res0, <4 x i64> %a0
+ ret <4 x i64> %res1
+}
+
+define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_maskz_permutex_epi64:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp21:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_permutex_epi64:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x i64> %a1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+ %res1 = select <4 x i1> %arg0, <4 x i64> %res0, <4 x i64> zeroinitializer
+ ret <4 x i64> %res1
+}
+
+define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) {
+; X32-LABEL: test_mm256_permutex_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permutex_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) {
+; X32-LABEL: test_mm256_mask_permutex_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp22:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_permutex_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x double> %a2, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+ %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
+ ret <4 x double> %res1
+}
+
+define <4 x double> @test_mm256_maskz_permutex_pd(i8 %a0, <4 x double> %a1) {
+; X32-LABEL: test_mm256_maskz_permutex_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp23:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_permutex_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+ %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
+ ret <4 x double> %res1
+}
+
+define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2, <2 x double> %a3) {
+; X32-LABEL: test_mm_mask_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp24:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $3, %al
+; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $3, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i2
+ %arg1 = bitcast i2 %trn1 to <2 x i1>
+ %res0 = shufflevector <2 x double> %a2, <2 x double> %a3, <2 x i32> <i32 1, i32 3>
+ %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0
+ ret <2 x double> %res1
+}
+
+define <2 x double> @test_mm_maskz_shuffle_pd(i8 %a0, <2 x double> %a1, <2 x double> %a2) {
+; X32-LABEL: test_mm_maskz_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp25:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $3, %al
+; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $3, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a0 to i2
+ %arg0 = bitcast i2 %trn1 to <2 x i1>
+ %res0 = shufflevector <2 x double> %a1, <2 x double> %a2, <2 x i32> <i32 1, i32 3>
+ %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer
+ ret <2 x double> %res1
+}
+
+define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) {
+; X32-LABEL: test_mm256_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
+; X64-NEXT: retq
+ %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2, <4 x double> %a3) {
+; X32-LABEL: test_mm256_mask_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp26:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x double> %a2, <4 x double> %a3, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
+ %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
+ ret <4 x double> %res1
+}
+
+define <4 x double> @test_mm256_maskz_shuffle_pd(i8 %a0, <4 x double> %a1, <4 x double> %a2) {
+; X32-LABEL: test_mm256_maskz_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp27:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x double> %a1, <4 x double> %a2, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
+ %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
+ ret <4 x double> %res1
+}
+
+define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_shuffle_ps:
+; X32: # BB#0:
+; X32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shuffle_ps:
+; X64: # BB#0:
+; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2, <4 x float> %a3) {
+; X32-LABEL: test_mm_mask_shuffle_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp28:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_shuffle_ps:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
+; X64-NEXT: retq
+ %trn1 = trunc i8 %a1 to i4
+ %arg1 = bitcast i4 %trn1 to <4 x i1>
+ %res0 = shufflevector <4 x float> %a2, <4 x float> %a3, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
+ %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
+ ret <4 x float> %res1
+}
+
+define <4 x float> @test_mm_maskz_shuffle_ps(i8 %a0, <4 x float> %a1, <4 x float> %a2) {
+; X32-LABEL: test_mm_maskz_shuffle_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: .Ltmp29:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movb %al, (%esp)
+; X32-NEXT: movzbl (%esp), %eax
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_shuffle_ps:
+; X64: # BB#0:
+; X64-NEXT: andb $15, %dil
+; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
+; X64-NEXT: retq
+ %trn0 = trunc i8 %a0 to i4
+ %arg0 = bitcast i4 %trn0 to <4 x i1>
+ %res0 = shufflevector <4 x float> %a1, <4 x float> %a2, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
+ %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
+ ret <4 x float> %res1
+}
+
+define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) {
+; X32-LABEL: test_mm256_shuffle_ps:
+; X32: # BB#0:
+; X32-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_ps:
+; X64: # BB#0:
+; X64-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
+; X64-NEXT: retq
+ %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_mm256_mask_shuffle_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2, <8 x float> %a3) {
+; X32-LABEL: test_mm256_mask_shuffle_ps:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_shuffle_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
+; X64-NEXT: retq
+ %arg1 = bitcast i8 %a1 to <8 x i1>
+ %res0 = shufflevector <8 x float> %a2, <8 x float> %a3, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
+ %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
+ ret <8 x float> %res1
+}
+
+define <8 x float> @test_mm256_maskz_shuffle_ps(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
+; X32-LABEL: test_mm256_maskz_shuffle_ps:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_shuffle_ps:
+; X64: # BB#0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
+; X64-NEXT: retq
+ %arg0 = bitcast i8 %a0 to <8 x i1>
+ %res0 = shufflevector <8 x float> %a1, <8 x float> %a2, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
+ %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
+ ret <8 x float> %res1
+}
+
+!0 = !{i32 1}
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
new file mode 100644
index 000000000000..f9126b4614eb
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -0,0 +1,2536 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+
+declare <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_pbroadcastd_256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask, i32 * %y_ptr) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x58,0xc8]
+; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x58,0xc0]
+; CHECK-NEXT: vpaddd (%rsi){1to8}, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x38,0xfe,0x0e]
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %y_32 = load i32, i32 * %y_ptr
+ %y = insertelement <4 x i32> undef, i32 %y_32, i32 0
+ %res = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %y, <8 x i32> %x1, i8 -1)
+ %res1 = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask)
+ %res2 = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res2, %res3
+ ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x58,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8]
+; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0]
+; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc9]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
+ %res1 = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask)
+ %res2 = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %mask)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res2, %res3
+ ret <4 x i32> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 ## encoding: [0x62,0xf2,0xfd,0x28,0x59,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8]
+; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0]
+; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc9]
+; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xd4,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1)
+ %res1 = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 %mask)
+ %res2 = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> zeroinitializer,i8 %mask)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res2, %res3
+ ret <4 x i64> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 ## encoding: [0x62,0xf2,0xfd,0x08,0x59,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8]
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0]
+; CHECK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc9]
+; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xd4,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1)
+ %res1 = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 %mask)
+ %res2 = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> zeroinitializer,i8 %mask)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res2, %res3
+ ret <2 x i64> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double>, <4 x double>, i8) nounwind readonly
+
+define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double> %a1, i8 %mask ) {
+; CHECK-LABEL: test_x86_vbroadcast_sd_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 ## encoding: [0x62,0xf2,0xfd,0x28,0x19,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8]
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0]
+; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc9]
+; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> %a1, i8 %mask)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
+ %res3 = fadd <4 x double> %res, %res1
+ %res4 = fadd <4 x double> %res2, %res3
+ ret <4 x double> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float>, <8 x float>, i8) nounwind readonly
+
+define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %a1, i8 %mask ) {
+; CHECK-LABEL: test_x86_vbroadcast_ss_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x18,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8]
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0]
+; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc9]
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> %a1, i8 %mask)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res2, %res3
+ ret <8 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float>, <4 x float>, i8) nounwind readonly
+
+define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask ) {
+; CHECK-LABEL: test_x86_vbroadcast_ss_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x18,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8]
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0]
+; CHECK-NEXT: vaddps %xmm1, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6c,0x08,0x58,0xc9]
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res2, %res3
+ ret <4 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovsldup %xmm0, %xmm2 ## encoding: [0x62,0xf1,0x7e,0x08,0x12,0xd0]
+; CHECK-NEXT: ## xmm2 = xmm0[0,0,2,2]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0,2,2]
+; CHECK-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0]
+; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0,2,2]
+; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xca]
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 -1)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res2, %res3
+ ret <4 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovsldup %ymm0, %ymm2 ## encoding: [0x62,0xf1,0x7e,0x28,0x12,0xd0]
+; CHECK-NEXT: ## ymm2 = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x12,0xc8]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vmovsldup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x12,0xc0]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xca]
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 -1)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res2, %res3
+ ret <8 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovshdup %xmm0, %xmm2 ## encoding: [0x62,0xf1,0x7e,0x08,0x16,0xd0]
+; CHECK-NEXT: ## xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x16,0xc8]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[1,1,3,3]
+; CHECK-NEXT: vmovshdup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x16,0xc0]
+; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1,1,3,3]
+; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xca]
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 -1)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res2, %res3
+ ret <4 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovshdup %ymm0, %ymm2 ## encoding: [0x62,0xf1,0x7e,0x28,0x16,0xd0]
+; CHECK-NEXT: ## ymm2 = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x16,0xc8]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: vmovshdup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x16,0xc0]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xca]
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 -1)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res2, %res3
+ ret <8 x float> %res4
+}
+declare <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double>, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movddup_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovddup %xmm0, %xmm2 ## encoding: [0x62,0xf1,0xff,0x08,0x12,0xd0]
+; CHECK-NEXT: ## xmm2 = xmm0[0,0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0]
+; CHECK-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0]
+; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0]
+; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xca]
+; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 -1)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> zeroinitializer, i8 %x2)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res2, %res3
+ ret <2 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_movddup_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovddup %ymm0, %ymm2 ## encoding: [0x62,0xf1,0xff,0x28,0x12,0xd0]
+; CHECK-NEXT: ## ymm2 = ymm0[0,0,2,2]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2]
+; CHECK-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2]
+; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xca]
+; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 -1)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> zeroinitializer, i8 %x2)
+ %res3 = fadd <4 x double> %res, %res1
+ %res4 = fadd <4 x double> %res2, %res3
+ ret <4 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double>, i32, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_vpermil_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilpd $6, %ymm0, %ymm2 ## encoding: [0x62,0xf3,0xfd,0x28,0x05,0xd0,0x06]
+; CHECK-NEXT: ## ymm2 = ymm0[0,1,3,2]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermilpd $6, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x05,0xc8,0x06]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,3,2]
+; CHECK-NEXT: vpermilpd $6, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x05,0xc0,0x06]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,1,3,2]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> zeroinitializer, i8 %x3)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 -1)
+ %res3 = fadd <4 x double> %res, %res1
+ %res4 = fadd <4 x double> %res2, %res3
+ ret <4 x double> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double>, i32, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_vpermil_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm2 ## encoding: [0x62,0xf3,0xfd,0x08,0x05,0xd0,0x01]
+; CHECK-NEXT: ## xmm2 = xmm0[1,0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x05,0xc8,0x01]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[1,0]
+; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0x89,0x05,0xc0,0x01]
+; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1,0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x58,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> zeroinitializer, i8 %x3)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 -1)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res3, %res2
+ ret <2 x double> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float>, i32, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_vpermil_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilps $22, %ymm0, %ymm2 ## encoding: [0x62,0xf3,0x7d,0x28,0x04,0xd0,0x16]
+; CHECK-NEXT: ## ymm2 = ymm0[2,1,1,0,6,5,5,4]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermilps $22, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x04,0xc8,0x16]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[2,1,1,0,6,5,5,4]
+; CHECK-NEXT: vpermilps $22, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x04,0xc0,0x16]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[2,1,1,0,6,5,5,4]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> zeroinitializer, i8 %x3)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 -1)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res3, %res2
+ ret <8 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float>, i32, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_vpermil_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermilps $22, %xmm0, %xmm2 ## encoding: [0x62,0xf3,0x7d,0x08,0x04,0xd0,0x16]
+; CHECK-NEXT: ## xmm2 = xmm0[2,1,1,0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermilps $22, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x04,0xc8,0x16]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[2,1,1,0]
+; CHECK-NEXT: vpermilps $22, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x04,0xc0,0x16]
+; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[2,1,1,0]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6c,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> zeroinitializer, i8 %x3)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 -1)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res2, %res3
+ ret <4 x float> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double>, i32, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_perm_df_256(<4 x double> %x0, i32 %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermpd $3, %ymm0, %ymm2 ## encoding: [0x62,0xf3,0xfd,0x28,0x01,0xd0,0x03]
+; CHECK-NEXT: ## ymm2 = ymm0[3,0,0,0]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpermpd $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x01,0xc8,0x03]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[3,0,0,0]
+; CHECK-NEXT: vpermpd $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x01,0xc0,0x03]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[3,0,0,0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x58,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> zeroinitializer, i8 %x3)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> %x2, i8 -1)
+ %res3 = fadd <4 x double> %res, %res1
+ %res4 = fadd <4 x double> %res3, %res2
+ ret <4 x double> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64>, i32, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_perm_di_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpermq $3, %ymm0, %ymm2 ## encoding: [0x62,0xf3,0xfd,0x28,0x00,0xd0,0x03]
+; CHECK-NEXT: ## ymm2 = ymm0[3,0,0,0]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpermq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x00,0xc8,0x03]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[3,0,0,0]
+; CHECK-NEXT: vpermq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x00,0xc0,0x03]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[3,0,0,0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xd4,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
+declare void @llvm.x86.avx512.mask.store.pd.128(i8*, <2 x double>, i8)
+
+define void@test_int_x86_avx512_mask_store_pd_128(i8* %ptr1, i8* %ptr2, <2 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovapd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x29,0x07]
+; CHECK-NEXT: vmovapd %xmm0, (%rsi) ## encoding: [0x62,0xf1,0xfd,0x08,0x29,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.store.pd.128(i8* %ptr1, <2 x double> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.store.pd.128(i8* %ptr2, <2 x double> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.pd.256(i8*, <4 x double>, i8)
+
+define void@test_int_x86_avx512_mask_store_pd_256(i8* %ptr1, i8* %ptr2, <4 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovapd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x29,0x07]
+; CHECK-NEXT: vmovapd %ymm0, (%rsi) ## encoding: [0x62,0xf1,0xfd,0x28,0x29,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.store.pd.256(i8* %ptr1, <4 x double> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.store.pd.256(i8* %ptr2, <4 x double> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.pd.128(i8*, <2 x double>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_pd_128(i8* %ptr1, i8* %ptr2, <2 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovupd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x11,0x07]
+; CHECK-NEXT: vmovupd %xmm0, (%rsi) ## encoding: [0x62,0xf1,0xfd,0x08,0x11,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.pd.128(i8* %ptr1, <2 x double> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.pd.128(i8* %ptr2, <2 x double> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.pd.256(i8*, <4 x double>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_pd_256(i8* %ptr1, i8* %ptr2, <4 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovupd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x11,0x07]
+; CHECK-NEXT: vmovupd %ymm0, (%rsi) ## encoding: [0x62,0xf1,0xfd,0x28,0x11,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.pd.256(i8* %ptr1, <4 x double> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.pd.256(i8* %ptr2, <4 x double> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.ps.128(i8*, <4 x float>, i8)
+
+define void@test_int_x86_avx512_mask_store_ps_128(i8* %ptr1, i8* %ptr2, <4 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovaps %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x29,0x07]
+; CHECK-NEXT: vmovaps %xmm0, (%rsi) ## encoding: [0x62,0xf1,0x7c,0x08,0x29,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.store.ps.128(i8* %ptr1, <4 x float> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.store.ps.128(i8* %ptr2, <4 x float> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.ps.256(i8*, <8 x float>, i8)
+
+define void@test_int_x86_avx512_mask_store_ps_256(i8* %ptr1, i8* %ptr2, <8 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovaps %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x29,0x07]
+; CHECK-NEXT: vmovaps %ymm0, (%rsi) ## encoding: [0x62,0xf1,0x7c,0x28,0x29,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.store.ps.256(i8* %ptr1, <8 x float> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.store.ps.256(i8* %ptr2, <8 x float> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.ps.128(i8*, <4 x float>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_ps_128(i8* %ptr1, i8* %ptr2, <4 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovups %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x11,0x07]
+; CHECK-NEXT: vmovups %xmm0, (%rsi) ## encoding: [0x62,0xf1,0x7c,0x08,0x11,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.ps.128(i8* %ptr1, <4 x float> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.ps.128(i8* %ptr2, <4 x float> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.ps.256(i8*, <8 x float>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_ps_256(i8* %ptr1, i8* %ptr2, <8 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovups %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x11,0x07]
+; CHECK-NEXT: vmovups %ymm0, (%rsi) ## encoding: [0x62,0xf1,0x7c,0x28,0x11,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.ps.256(i8* %ptr1, <8 x float> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.ps.256(i8* %ptr2, <8 x float> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.q.128(i8*, <2 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_q_128(i8* %ptr1, i8* %ptr2, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu64 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x7f,0x07]
+; CHECK-NEXT: vmovdqu64 %xmm0, (%rsi) ## encoding: [0x62,0xf1,0xfe,0x08,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.q.128(i8* %ptr1, <2 x i64> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.q.128(i8* %ptr2, <2 x i64> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.q.256(i8*, <4 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_q_256(i8* %ptr1, i8* %ptr2, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu64 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x7f,0x07]
+; CHECK-NEXT: vmovdqu64 %ymm0, (%rsi) ## encoding: [0x62,0xf1,0xfe,0x28,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.q.256(i8* %ptr1, <4 x i64> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.q.256(i8* %ptr2, <4 x i64> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.d.128(i8*, <4 x i32>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_d_128(i8* %ptr1, i8* %ptr2, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x7f,0x07]
+; CHECK-NEXT: vmovdqu32 %xmm0, (%rsi) ## encoding: [0x62,0xf1,0x7e,0x08,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.d.128(i8* %ptr1, <4 x i32> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.d.128(i8* %ptr2, <4 x i32> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.d.256(i8*, <8 x i32>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_d_256(i8* %ptr1, i8* %ptr2, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu32 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x7f,0x07]
+; CHECK-NEXT: vmovdqu32 %ymm0, (%rsi) ## encoding: [0x62,0xf1,0x7e,0x28,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.storeu.d.256(i8* %ptr1, <8 x i32> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.storeu.d.256(i8* %ptr2, <8 x i32> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.q.128(i8*, <2 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_store_q_128(i8* %ptr1, i8* %ptr2, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqa64 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x7f,0x07]
+; CHECK-NEXT: vmovdqa64 %xmm0, (%rsi) ## encoding: [0x62,0xf1,0xfd,0x08,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.store.q.128(i8* %ptr1, <2 x i64> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.store.q.128(i8* %ptr2, <2 x i64> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.q.256(i8*, <4 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_store_q_256(i8* %ptr1, i8* %ptr2, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqa64 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x7f,0x07]
+; CHECK-NEXT: vmovdqa64 %ymm0, (%rsi) ## encoding: [0x62,0xf1,0xfd,0x28,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.store.q.256(i8* %ptr1, <4 x i64> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.store.q.256(i8* %ptr2, <4 x i64> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.d.128(i8*, <4 x i32>, i8)
+
+define void@test_int_x86_avx512_mask_store_d_128(i8* %ptr1, i8* %ptr2, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x7f,0x07]
+; CHECK-NEXT: vmovdqa32 %xmm0, (%rsi) ## encoding: [0x62,0xf1,0x7d,0x08,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.store.d.128(i8* %ptr1, <4 x i32> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.store.d.128(i8* %ptr2, <4 x i32> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.d.256(i8*, <8 x i32>, i8)
+
+define void@test_int_x86_avx512_mask_store_d_256(i8* %ptr1, i8* %ptr2, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqa32 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x7f,0x07]
+; CHECK-NEXT: vmovdqa32 %ymm0, (%rsi) ## encoding: [0x62,0xf1,0x7d,0x28,0x7f,0x06]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ call void @llvm.x86.avx512.mask.store.d.256(i8* %ptr1, <8 x i32> %x1, i8 %x2)
+ call void @llvm.x86.avx512.mask.store.d.256(i8* %ptr2, <8 x i32> %x1, i8 -1)
+ ret void
+}
+
+define <8 x float> @test_mask_load_aligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x28,0x07]
+; CHECK-NEXT: vmovaps (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x28,0x0f]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 -1)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> %res, i8 %mask)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 %mask)
+ %res4 = fadd <8 x float> %res2, %res1
+ ret <8 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8*, <8 x float>, i8)
+
+define <8 x float> @test_mask_load_unaligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x10,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x10,0x07]
+; CHECK-NEXT: vmovups (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x10,0x0f]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 -1)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> %res, i8 %mask)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 %mask)
+ %res4 = fadd <8 x float> %res2, %res1
+ ret <8 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8*, <8 x float>, i8)
+
+define <4 x double> @test_mask_load_aligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x28,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x28,0x07]
+; CHECK-NEXT: vmovapd (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x28,0x0f]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 -1)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> %res, i8 %mask)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <4 x double> %res2, %res1
+ ret <4 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8*, <4 x double>, i8)
+
+define <4 x double> @test_mask_load_unaligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x10,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x10,0x07]
+; CHECK-NEXT: vmovupd (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x10,0x0f]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 -1)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> %res, i8 %mask)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <4 x double> %res2, %res1
+ ret <4 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8*, <4 x double>, i8)
+
+define <4 x float> @test_mask_load_aligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x28,0x07]
+; CHECK-NEXT: vmovaps (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x28,0x0f]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 -1)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> %res, i8 %mask)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 %mask)
+ %res4 = fadd <4 x float> %res2, %res1
+ ret <4 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8*, <4 x float>, i8)
+
+define <4 x float> @test_mask_load_unaligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x10,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x10,0x07]
+; CHECK-NEXT: vmovups (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x10,0x0f]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 -1)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> %res, i8 %mask)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 %mask)
+ %res4 = fadd <4 x float> %res2, %res1
+ ret <4 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8*, <4 x float>, i8)
+
+define <2 x double> @test_mask_load_aligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x28,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x28,0x07]
+; CHECK-NEXT: vmovapd (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x28,0x0f]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 -1)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> %res, i8 %mask)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <2 x double> %res2, %res1
+ ret <2 x double> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8*, <2 x double>, i8)
+
+define <2 x double> @test_mask_load_unaligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x10,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x10,0x07]
+; CHECK-NEXT: vmovupd (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x10,0x0f]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 -1)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> %res, i8 %mask)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <2 x double> %res2, %res1
+ ret <2 x double> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8*, <2 x double>, i8)
+
+declare <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8*, <4 x i32>, i8)
+
+define <4 x i32> @test_mask_load_unaligned_d_128(i8* %ptr, i8* %ptr2, <4 x i32> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x6f,0x07]
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu32 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x6f,0x06]
+; CHECK-NEXT: vmovdqu32 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x6f,0x0f]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8* %ptr, <4 x i32> zeroinitializer, i8 -1)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8* %ptr2, <4 x i32> %res, i8 %mask)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8* %ptr, <4 x i32> zeroinitializer, i8 %mask)
+ %res4 = add <4 x i32> %res2, %res1
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8*, <8 x i32>, i8)
+
+define <8 x i32> @test_mask_load_unaligned_d_256(i8* %ptr, i8* %ptr2, <8 x i32> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7e,0x28,0x6f,0x07]
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu32 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x6f,0x06]
+; CHECK-NEXT: vmovdqu32 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x6f,0x0f]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8* %ptr, <8 x i32> zeroinitializer, i8 -1)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8* %ptr2, <8 x i32> %res, i8 %mask)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8* %ptr, <8 x i32> zeroinitializer, i8 %mask)
+ %res4 = add <8 x i32> %res2, %res1
+ ret <8 x i32> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8*, <2 x i64>, i8)
+
+define <2 x i64> @test_mask_load_unaligned_q_128(i8* %ptr, i8* %ptr2, <2 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x6f,0x07]
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu64 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x6f,0x06]
+; CHECK-NEXT: vmovdqu64 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0x89,0x6f,0x0f]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8* %ptr, <2 x i64> zeroinitializer, i8 -1)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8* %ptr2, <2 x i64> %res, i8 %mask)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8* %ptr, <2 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <2 x i64> %res2, %res1
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8*, <4 x i64>, i8)
+
+define <4 x i64> @test_mask_load_unaligned_q_256(i8* %ptr, i8* %ptr2, <4 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xfe,0x28,0x6f,0x07]
+; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT: vmovdqu64 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x6f,0x06]
+; CHECK-NEXT: vmovdqu64 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xa9,0x6f,0x0f]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8* %ptr, <4 x i64> zeroinitializer, i8 -1)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8* %ptr2, <4 x i64> %res, i8 %mask)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8* %ptr, <4 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <4 x i64> %res2, %res1
+ ret <4 x i64> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.load.d.128(i8*, <4 x i32>, i8)
+
+define <4 x i32> @test_mask_load_aligned_d_128(<4 x i32> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6f,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6f,0x07]
+; CHECK-NEXT: vmovdqa32 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6f,0x0f]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.load.d.128(i8* %ptr, <4 x i32> zeroinitializer, i8 -1)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.load.d.128(i8* %ptr, <4 x i32> %res, i8 %mask)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.load.d.128(i8* %ptr, <4 x i32> zeroinitializer, i8 %mask)
+ %res4 = add <4 x i32> %res2, %res1
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.load.d.256(i8*, <8 x i32>, i8)
+
+define <8 x i32> @test_mask_load_aligned_d_256(<8 x i32> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x6f,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6f,0x07]
+; CHECK-NEXT: vmovdqa32 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6f,0x0f]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.load.d.256(i8* %ptr, <8 x i32> zeroinitializer, i8 -1)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.load.d.256(i8* %ptr, <8 x i32> %res, i8 %mask)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.load.d.256(i8* %ptr, <8 x i32> zeroinitializer, i8 %mask)
+ %res4 = add <8 x i32> %res2, %res1
+ ret <8 x i32> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.load.q.128(i8*, <2 x i64>, i8)
+
+define <2 x i64> @test_mask_load_aligned_q_128(<2 x i64> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x6f,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6f,0x07]
+; CHECK-NEXT: vmovdqa64 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x6f,0x0f]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.load.q.128(i8* %ptr, <2 x i64> zeroinitializer, i8 -1)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.load.q.128(i8* %ptr, <2 x i64> %res, i8 %mask)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.load.q.128(i8* %ptr, <2 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <2 x i64> %res2, %res1
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.load.q.256(i8*, <4 x i64>, i8)
+
+define <4 x i64> @test_mask_load_aligned_q_256(<4 x i64> %data, i8* %ptr, i8 %mask) {
+; CHECK-LABEL: test_mask_load_aligned_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x6f,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6f,0x07]
+; CHECK-NEXT: vmovdqa64 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x6f,0x0f]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.load.q.256(i8* %ptr, <4 x i64> zeroinitializer, i8 -1)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.load.q.256(i8* %ptr, <4 x i64> %res, i8 %mask)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.load.q.256(i8* %ptr, <4 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <4 x i64> %res2, %res1
+ ret <4 x i64> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32>, i32, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pshuf_d_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpshufd $3, %xmm0, %xmm2 ## encoding: [0x62,0xf1,0x7d,0x08,0x70,0xd0,0x03]
+; CHECK-NEXT: ## xmm2 = xmm0[3,0,0,0]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpshufd $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x70,0xc8,0x03]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[3,0,0,0]
+; CHECK-NEXT: vpshufd $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x70,0xc0,0x03]
+; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[3,0,0,0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32>, i32, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_pshuf_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpshufd $3, %ymm0, %ymm2 ## encoding: [0x62,0xf1,0x7d,0x28,0x70,0xd0,0x03]
+; CHECK-NEXT: ## ymm2 = ymm0[3,0,0,0,7,4,4,4]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpshufd $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x70,0xc8,0x03]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[3,0,0,0,7,4,4,4]
+; CHECK-NEXT: vpshufd $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x70,0xc0,0x03]
+; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[3,0,0,0,7,4,4,4]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfe,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_pcmpeq_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x76,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32>, <8 x i32>, i8)
+
+define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: test_pcmpeq_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x29,0xc1]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x29,0xc1]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64>, <4 x i64>, i8)
+
+define i8 @test_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_pcmpgt_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x66,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x66,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32>, <8 x i32>, i8)
+
+define i8 @test_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: test_pcmpgt_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x37,0xc1]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x37,0xc1]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64>, <4 x i64>, i8)
+
+define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_pcmpeq_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x76,0xc1]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x76,0xc1]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32>, <4 x i32>, i8)
+
+define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_pcmpeq_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x29,0xc1]
+; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e]
+; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x29,0xc1]
+; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e]
+; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64>, <2 x i64>, i8)
+
+define i8 @test_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_pcmpgt_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x66,0xc1]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x66,0xc1]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32>, <4 x i32>, i8)
+
+define i8 @test_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_pcmpgt_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x37,0xc1]
+; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e]
+; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
+ ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x37,0xc1]
+; CHECK-NEXT: kshiftlw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0e]
+; CHECK-NEXT: kshiftrw $14, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
+; CHECK-NEXT: kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64>, <2 x i64>, i8)
+
+declare <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_unpckh_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0xfd,0x08,0x15,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[1],xmm1[1]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x15,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[1],xmm1[1]
+; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0x58,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_unpckh_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpckhpd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0xfd,0x28,0x15,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vunpckhpd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x15,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+ %res2 = fadd <4 x double> %res, %res1
+ ret <4 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_unpckh_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpckhps %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x15,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vunpckhps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x15,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6c,0x08,0x58,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_unpckh_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x15,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x15,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_unpckl_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpcklpd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0xfd,0x08,0x14,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vunpcklpd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x14,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[0],xmm1[0]
+; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0x58,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_unpckl_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpcklpd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0xfd,0x28,0x14,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vunpcklpd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x14,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+ %res2 = fadd <4 x double> %res, %res1
+ ret <4 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_unpckl_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpcklps %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x14,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vunpcklps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x14,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6c,0x08,0x58,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_unpckl_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vunpcklps %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x14,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vunpcklps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x14,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+ %res2 = fadd <8 x float> %res, %res1
+ ret <8 x float> %res2
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_punpckhd_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7d,0x08,0x6a,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6a,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+ %res2 = add <4 x i32> %res, %res1
+ ret <4 x i32> %res2
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_punpckld_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckldq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7d,0x08,0x62,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpckldq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x62,0xd1]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+ %res2 = add <4 x i32> %res, %res1
+ ret <4 x i32> %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_punpckhd_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7d,0x28,0x6a,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6a,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_punpckld_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckldq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7d,0x28,0x62,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpckldq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x62,0xd1]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_punpckhqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0xfd,0x08,0x6d,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[1],xmm1[1]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6d,0xd1]
+; CHECK-NEXT: ## xmm2 = xmm0[1],xmm1[1]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res2 = add <2 x i64> %res, %res1
+ ret <2 x i64> %res2
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_punpcklqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0xfd,0x08,0x6c,0xd9]
+; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6c,0xd1]
+; CHECK-NEXT: ## xmm2 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+ %res2 = add <2 x i64> %res, %res1
+ ret <2 x i64> %res2
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_punpcklqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0xfd,0x28,0x6c,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6c,0xd1]
+; CHECK-NEXT: ## ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res2 = add <4 x i64> %res, %res1
+ ret <4 x i64> %res2
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_punpckhqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0xfd,0x28,0x6d,0xd9]
+; CHECK-NEXT: ## ymm3 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6d,0xd1]
+; CHECK-NEXT: ## ymm2 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res2 = add <4 x i64> %res, %res1
+ ret <4 x i64> %res2
+}
+
+define <4 x i32> @test_mask_and_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_mask_and_epi32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdb,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_and_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdb,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_and_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdb,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_and_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_and_epi32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandd (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_and_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdb,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_and_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_and_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_and_epi32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xdb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_and_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xdb,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_and_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xdb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <8 x i32> @test_mask_and_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_mask_and_epi32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdb,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_and_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdb,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_and_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_and_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_and_epi32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandd (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_and_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdb,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_and_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_and_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_and_epi32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xdb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_and_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xdb,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_and_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_and_epi32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xdb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <4 x i32> @test_mask_or_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_mask_or_epi32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpord %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xeb,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_or_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpord %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xeb,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_or_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpord %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xeb,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_or_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_or_epi32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpord (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xeb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_or_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpord (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xeb,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_or_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpord (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xeb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_or_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_or_epi32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpord (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xeb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_or_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpord (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xeb,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_or_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xeb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <8 x i32> @test_mask_or_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_mask_or_epi32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpord %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xeb,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_or_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpord %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xeb,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_or_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpord %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_or_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_or_epi32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpord (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xeb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_or_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpord (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xeb,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_or_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpord (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_or_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_or_epi32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpord (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xeb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_or_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpord (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xeb,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_or_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_or_epi32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpord (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xeb,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <4 x i32> @test_mask_xor_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_mask_xor_epi32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xef,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_xor_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpxord %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xef,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_xor_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpxord %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xef,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_xor_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_xor_epi32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xef,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_xor_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpxord (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xef,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_xor_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpxord (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xef,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_xor_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_xor_epi32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xef,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_xor_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpxord (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xef,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_xor_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpxord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xef,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <8 x i32> @test_mask_xor_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_mask_xor_epi32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xef,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_xor_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpxord %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xef,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_xor_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpxord %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xef,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_xor_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_xor_epi32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xef,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_xor_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpxord (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xef,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_xor_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpxord (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xef,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_xor_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_xor_epi32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xef,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_xor_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpxord (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xef,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_xor_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_xor_epi32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpxord (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xef,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <4 x i32> @test_mask_andnot_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_mask_andnot_epi32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdf,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_andnot_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandnd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdf,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_andnot_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandnd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdf,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_andnot_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_andnot_epi32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnd (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_andnot_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdf,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_andnot_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i32>, <4 x i32>* %ptr_b
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_andnot_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_andnot_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xdf,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_andnot_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <8 x i32> @test_mask_andnot_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_mask_andnot_epi32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdf,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_andnot_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandnd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdf,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_andnot_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandnd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_andnot_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_andnot_epi32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnd (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_andnot_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdf,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_andnot_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <8 x i32>, <8 x i32>* %ptr_b
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_andnot_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_andnot_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xdf,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_andnot_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i32, i32* %ptr_b
+ %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
+ ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <2 x i64> @test_mask_andnot_epi64_rr_128(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_mask_andnot_epi64_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xdf,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mask_andnot_epi64_rrk_128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandnq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xdf,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mask_andnot_epi64_rrkz_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandnq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xdf,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mask_andnot_epi64_rm_128(<2 x i64> %a, <2 x i64>* %ptr_b) {
+; CHECK-LABEL: test_mask_andnot_epi64_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnq (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <2 x i64>, <2 x i64>* %ptr_b
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mask_andnot_epi64_rmk_128(<2 x i64> %a, <2 x i64>* %ptr_b, <2 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xdf,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <2 x i64>, <2 x i64>* %ptr_b
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mask_andnot_epi64_rmkz_128(<2 x i64> %a, <2 x i64>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <2 x i64>, <2 x i64>* %ptr_b
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mask_andnot_epi64_rmb_128(<2 x i64> %a, i64* %ptr_b) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x18,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i64, i64* %ptr_b
+ %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
+ %b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mask_andnot_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x19,0xdf,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i64, i64* %ptr_b
+ %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
+ %b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mask_andnot_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i64, i64* %ptr_b
+ %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
+ %b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
+ ret <2 x i64> %res
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <4 x i64> @test_mask_andnot_epi64_rr_256(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: test_mask_andnot_epi64_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xdf,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mask_andnot_epi64_rrk_256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandnq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xdf,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mask_andnot_epi64_rrkz_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpandnq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mask_andnot_epi64_rm_256(<4 x i64> %a, <4 x i64>* %ptr_b) {
+; CHECK-LABEL: test_mask_andnot_epi64_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnq (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i64>, <4 x i64>* %ptr_b
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mask_andnot_epi64_rmk_256(<4 x i64> %a, <4 x i64>* %ptr_b, <4 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xdf,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i64>, <4 x i64>* %ptr_b
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mask_andnot_epi64_rmkz_256(<4 x i64> %a, <4 x i64>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %b = load <4 x i64>, <4 x i64>* %ptr_b
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mask_andnot_epi64_rmb_256(<4 x i64> %a, i64* %ptr_b) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpandnq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x38,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i64, i64* %ptr_b
+ %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
+ %b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mask_andnot_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 x i64> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x39,0xdf,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i64, i64* %ptr_b
+ %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
+ %b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mask_andnot_epi64_rmbkz_256(<4 x i64> %a, i64* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_andnot_epi64_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpandnq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0xdf,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %q = load i64, i64* %ptr_b
+ %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
+ %b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
+ ret <4 x i64> %res
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index a4f3e666833a..41376cf602c4 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -1,124 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s
; 256-bit
-define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: test_pcmpeq_d_256
-; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_d_256
-; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32>, <8 x i32>, i8)
-
-define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) {
-; CHECK-LABEL: test_pcmpeq_q_256
-; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_q_256
-; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64>, <4 x i64>, i8)
-
-define i8 @test_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b) {
-; CHECK-LABEL: test_pcmpgt_d_256
-; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_d_256
-; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32>, <8 x i32>, i8)
-
-define i8 @test_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b) {
-; CHECK-LABEL: test_pcmpgt_q_256
-; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_q_256
-; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64>, <4 x i64>, i8)
-
define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: test_cmp_d_256
-; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
+; CHECK-LABEL: test_cmp_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltd %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x01]
+; CHECK-NEXT: vpcmpled %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordd %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd9,0x03]
+; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltd %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xe9,0x05]
+; CHECK-NEXT: vpcmpnled %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xf1,0x06]
+; CHECK-NEXT: vpcmpordd %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltd %ymm1, %ymm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpled %ymm1, %ymm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnled %ymm1, %ymm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordd %ymm1, %ymm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
-; CHECK-LABEL: test_mask_cmp_d_256
-; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_cmp_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltd %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xd1,0x01]
+; CHECK-NEXT: vpcmpled %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordd %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xe1,0x03]
+; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltd %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xf1,0x05]
+; CHECK-NEXT: vpcmpnled %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xf9,0x06]
+; CHECK-NEXT: vpcmpordd %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltd %ymm1, %ymm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpled %ymm1, %ymm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnled %ymm1, %ymm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordd %ymm1, %ymm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -127,58 +101,95 @@ define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: test_ucmp_d_256
-; CHECK: vpcmpequd %ymm1, %ymm0, %k0 ##
+; CHECK-LABEL: test_ucmp_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpequd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc9,0x01]
+; CHECK-NEXT: vpcmpleud %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordud %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xd9,0x03]
+; CHECK-NEXT: vpcmpnequd %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltud %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xf1,0x06]
+; CHECK-NEXT: vpcmpordud %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltud %ymm1, %ymm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleud %ymm1, %ymm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordud %ymm1, %ymm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
-; CHECK-LABEL: test_mask_ucmp_d_256
-; CHECK: vpcmpequd %ymm1, %ymm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_ucmp_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpequd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xd1,0x01]
+; CHECK-NEXT: vpcmpleud %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordud %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xe1,0x03]
+; CHECK-NEXT: vpcmpnequd %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltud %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xf9,0x06]
+; CHECK-NEXT: vpcmpordud %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltud %ymm1, %ymm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleud %ymm1, %ymm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordud %ymm1, %ymm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -187,58 +198,95 @@ define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: test_cmp_q_256
-; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
+; CHECK-LABEL: test_cmp_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x01]
+; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordq %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd9,0x03]
+; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltq %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleq %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xf1,0x06]
+; CHECK-NEXT: vpcmpordq %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltq %ymm1, %ymm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleq %ymm1, %ymm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordq %ymm1, %ymm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_mask_cmp_q_256
-; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_cmp_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltq %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xd1,0x01]
+; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordq %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe1,0x03]
+; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltq %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleq %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xf9,0x06]
+; CHECK-NEXT: vpcmpordq %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltq %ymm1, %ymm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleq %ymm1, %ymm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordq %ymm1, %ymm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -247,58 +295,95 @@ define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone
define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: test_ucmp_q_256
-; CHECK: vpcmpequq %ymm1, %ymm0, %k0 ##
+; CHECK-LABEL: test_ucmp_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpequq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc9,0x01]
+; CHECK-NEXT: vpcmpleuq %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xd1,0x02]
+; CHECK-NEXT: vpcmpunorduq %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xd9,0x03]
+; CHECK-NEXT: vpcmpnequq %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltuq %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xf1,0x06]
+; CHECK-NEXT: vpcmporduq %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmporduq %ymm1, %ymm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_mask_ucmp_q_256
-; CHECK: vpcmpequq %ymm1, %ymm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_ucmp_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpequq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd1,0x01]
+; CHECK-NEXT: vpcmpleuq %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd9,0x02]
+; CHECK-NEXT: vpcmpunorduq %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xe1,0x03]
+; CHECK-NEXT: vpcmpnequq %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltuq %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xf9,0x06]
+; CHECK-NEXT: vpcmporduq %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmporduq %ymm1, %ymm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -308,123 +393,96 @@ declare i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounw
; 128-bit
-define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test_pcmpeq_d_128
-; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_d_128
-; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32>, <4 x i32>, i8)
-
-define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test_pcmpeq_q_128
-; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_q_128
-; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64>, <2 x i64>, i8)
-
-define i8 @test_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: test_pcmpgt_d_128
-; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_d_128
-; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32>, <4 x i32>, i8)
-
-define i8 @test_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test_pcmpgt_q_128
-; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_q_128
-; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} ##
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64>, <2 x i64>, i8)
-
define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_cmp_d_128
-; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_cmp_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x01]
+; CHECK-NEXT: vpcmpled %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordd %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd9,0x03]
+; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltd %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xe9,0x05]
+; CHECK-NEXT: vpcmpnled %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xf1,0x06]
+; CHECK-NEXT: vpcmpordd %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltd %xmm1, %xmm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpled %xmm1, %xmm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnled %xmm1, %xmm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordd %xmm1, %xmm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
-; CHECK-LABEL: test_mask_cmp_d_128
-; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_cmp_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltd %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xd1,0x01]
+; CHECK-NEXT: vpcmpled %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordd %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe1,0x03]
+; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltd %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xf1,0x05]
+; CHECK-NEXT: vpcmpnled %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xf9,0x06]
+; CHECK-NEXT: vpcmpordd %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltd %xmm1, %xmm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpled %xmm1, %xmm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnled %xmm1, %xmm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordd %xmm1, %xmm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -433,58 +491,95 @@ define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_ucmp_d_128
-; CHECK: vpcmpequd %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_ucmp_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpequd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc9,0x01]
+; CHECK-NEXT: vpcmpleud %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordud %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xd9,0x03]
+; CHECK-NEXT: vpcmpnequd %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltud %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xf1,0x06]
+; CHECK-NEXT: vpcmpordud %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltud %xmm1, %xmm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleud %xmm1, %xmm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordud %xmm1, %xmm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
-; CHECK-LABEL: test_mask_ucmp_d_128
-; CHECK: vpcmpequd %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_ucmp_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpequd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd1,0x01]
+; CHECK-NEXT: vpcmpleud %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordud %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xe1,0x03]
+; CHECK-NEXT: vpcmpnequd %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltud %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xf9,0x06]
+; CHECK-NEXT: vpcmpordud %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltud %xmm1, %xmm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleud %xmm1, %xmm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordud %xmm1, %xmm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -493,58 +588,95 @@ define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_cmp_q_128
-; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_cmp_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x01]
+; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd1,0x02]
+; CHECK-NEXT: vpcmpunordq %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd9,0x03]
+; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltq %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleq %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xf1,0x06]
+; CHECK-NEXT: vpcmpordq %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltq %xmm1, %xmm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleq %xmm1, %xmm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordq %xmm1, %xmm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_mask_cmp_q_128
-; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_cmp_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xc1,0x00]
+; CHECK-NEXT: vpcmpltq %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xd1,0x01]
+; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xd9,0x02]
+; CHECK-NEXT: vpcmpunordq %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe1,0x03]
+; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltq %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleq %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xf9,0x06]
+; CHECK-NEXT: vpcmpordq %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltq %xmm1, %xmm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleq %xmm1, %xmm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmpordq %xmm1, %xmm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -553,58 +685,95 @@ define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone
define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_ucmp_q_128
-; CHECK: vpcmpequq %xmm1, %xmm0, %k0 ##
+; CHECK-LABEL: test_ucmp_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpequq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc9,0x01]
+; CHECK-NEXT: vpcmpleuq %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xd1,0x02]
+; CHECK-NEXT: vpcmpunorduq %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xd9,0x03]
+; CHECK-NEXT: vpcmpnequq %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe1,0x04]
+; CHECK-NEXT: vpcmpnltuq %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe9,0x05]
+; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xf1,0x06]
+; CHECK-NEXT: vpcmporduq %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xf9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmporduq %xmm1, %xmm0, %k0 ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
}
define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_mask_ucmp_q_128
-; CHECK: vpcmpequq %xmm1, %xmm0, %k0 {%k1} ##
+; CHECK-LABEL: test_mask_ucmp_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpequq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xc1,0x00]
+; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd1,0x01]
+; CHECK-NEXT: vpcmpleuq %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd9,0x02]
+; CHECK-NEXT: vpcmpunorduq %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xe1,0x03]
+; CHECK-NEXT: vpcmpnequq %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltuq %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xf9,0x06]
+; CHECK-NEXT: vpcmporduq %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xc9,0x07]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x00]
+; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
%vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
-; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} ##
%res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask)
%vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
-; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 {%k1} ##
%res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask)
%vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
-; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 {%k1} ##
%res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask)
%vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
-; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 {%k1} ##
%res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask)
%vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
-; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 {%k1} ##
%res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask)
%vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
-; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 {%k1} ##
%res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask)
%vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
-; CHECK: vpcmporduq %xmm1, %xmm0, %k0 {%k1} ##
%res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask)
%vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
ret <8 x i8> %vec7
@@ -612,87 +781,114 @@ define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone
-; CHECK-LABEL: compr1
-; CHECK: vcompresspd %zmm0
define void @compr1(i8* %addr, <8 x double> %data, i8 %mask) {
+; CHECK-LABEL: compr1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vcompresspd %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8a,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
ret void
}
declare void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
-; CHECK-LABEL: compr2
-; CHECK: vcompresspd %ymm0
define void @compr2(i8* %addr, <4 x double> %data, i8 %mask) {
+; CHECK-LABEL: compr2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vcompresspd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
ret void
}
declare void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
-; CHECK-LABEL: compr3
-; CHECK: vcompressps %xmm0
define void @compr3(i8* %addr, <4 x float> %data, i8 %mask) {
+; CHECK-LABEL: compr3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vcompressps %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x8a,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
ret void
}
declare void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
-; CHECK-LABEL: compr4
-; CHECK: vcompresspd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x8a,0xc0]
define <8 x double> @compr4(i8* %addr, <8 x double> %data, i8 %mask) {
+; CHECK-LABEL: compr4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vcompresspd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x8a,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)
-; CHECK-LABEL: compr5
-; CHECK: vcompresspd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0xc1]
define <4 x double> @compr5(<4 x double> %data, <4 x double> %src0, i8 %mask) {
+; CHECK-LABEL: compr5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcompresspd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0xc1]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.compress.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask)
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx512.mask.compress.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask)
-; CHECK-LABEL: compr6
-; CHECK: vcompressps %xmm0
define <4 x float> @compr6(<4 x float> %data, i8 %mask) {
+; CHECK-LABEL: compr6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcompressps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8a,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask)
-; CHECK-LABEL: compr7
-; CHECK-NOT: vcompress
-; CHECK: vmovupd
define void @compr7(i8* %addr, <8 x double> %data) {
+; CHECK-LABEL: compr7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x48,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 -1)
ret void
}
-; CHECK-LABEL: compr8
-; CHECK-NOT: vcompressps %xmm0
define <4 x float> @compr8(<4 x float> %data) {
+; CHECK-LABEL: compr8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1)
ret <4 x float> %res
}
-; CHECK-LABEL: compr9
-; CHECK: vpcompressq %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8b,0x07]
define void @compr9(i8* %addr, <8 x i64> %data, i8 %mask) {
+; CHECK-LABEL: compr9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpcompressq %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8b,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
ret void
}
declare void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
-; CHECK-LABEL: compr10
-; CHECK: vpcompressd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8b,0xc0]
define <4 x i32> @compr10(<4 x i32> %data, i8 %mask) {
+; CHECK-LABEL: compr10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcompressd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8b,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask)
ret <4 x i32> %res
}
@@ -701,217 +897,188 @@ declare <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32
; Expand
-; CHECK-LABEL: expand1
-; CHECK: vexpandpd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x88,0x07]
define <8 x double> @expand1(i8* %addr, <8 x double> %data, i8 %mask) {
+; CHECK-LABEL: expand1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vexpandpd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x88,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
-; CHECK-LABEL: expand2
-; CHECK: vexpandpd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0x07]
define <4 x double> @expand2(i8* %addr, <4 x double> %data, i8 %mask) {
+; CHECK-LABEL: expand2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vexpandpd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
-; CHECK-LABEL: expand3
-; CHECK: vexpandps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x88,0x07]
define <4 x float> @expand3(i8* %addr, <4 x float> %data, i8 %mask) {
+; CHECK-LABEL: expand3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vexpandps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x88,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
-; CHECK-LABEL: expand4
-; CHECK: vexpandpd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x88,0xc0]
define <8 x double> @expand4(i8* %addr, <8 x double> %data, i8 %mask) {
+; CHECK-LABEL: expand4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x88,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)
-; CHECK-LABEL: expand5
-; CHECK: vexpandpd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0xc8]
define <4 x double> @expand5(<4 x double> %data, <4 x double> %src0, i8 %mask) {
+; CHECK-LABEL: expand5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vexpandpd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0xc8]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.expand.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask)
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx512.mask.expand.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask)
-; CHECK-LABEL: expand6
-; CHECK: vexpandps %xmm0
define <4 x float> @expand6(<4 x float> %data, i8 %mask) {
+; CHECK-LABEL: expand6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vexpandps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x88,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask)
-; CHECK-LABEL: expand7
-; CHECK-NOT: vexpand
-; CHECK: vmovupd
define <8 x double> @expand7(i8* %addr, <8 x double> %data) {
+; CHECK-LABEL: expand7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 -1)
ret <8 x double> %res
}
-; CHECK-LABEL: expand8
-; CHECK-NOT: vexpandps %xmm0
define <4 x float> @expand8(<4 x float> %data) {
+; CHECK-LABEL: expand8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1)
ret <4 x float> %res
}
-; CHECK-LABEL: expand9
-; CHECK: vpexpandq (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x89,0x07]
define <8 x i64> @expand9(i8* %addr, <8 x i64> %data, i8 %mask) {
+; CHECK-LABEL: expand9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpexpandq (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x89,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
ret <8 x i64> %res
}
declare <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
-; CHECK-LABEL: expand10
-; CHECK: vpexpandd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x89,0xc0]
define <4 x i32> @expand10(<4 x i32> %data, i8 %mask) {
+; CHECK-LABEL: expand10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpexpandd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x89,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask)
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask)
-define <8 x float> @test_x86_mask_blend_ps_256(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
- ; CHECK: vblendmps %ymm1, %ymm0
- %res = call <8 x float> @llvm.x86.avx512.mask.blend.ps.256(<8 x float> %a1, <8 x float> %a2, i8 %a0) ; <<8 x float>> [#uses=1]
- ret <8 x float> %res
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readonly
-
-define <4 x double> @test_x86_mask_blend_pd_256(i8 %a0, <4 x double> %a1, <4 x double> %a2) {
- ; CHECK: vblendmpd %ymm1, %ymm0
- %res = call <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double> %a1, <4 x double> %a2, i8 %a0) ; <<4 x double>> [#uses=1]
- ret <4 x double> %res
-}
-
-define <4 x double> @test_x86_mask_blend_pd_256_memop(<4 x double> %a, <4 x double>* %ptr, i8 %mask) {
- ; CHECK-LABEL: test_x86_mask_blend_pd_256_memop
- ; CHECK: vblendmpd (%
- %b = load <4 x double>, <4 x double>* %ptr
- %res = call <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double> %a, <4 x double> %b, i8 %mask) ; <<4 x double>> [#uses=1]
- ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readonly
-
-; CHECK-LABEL: test_x86_mask_blend_d_256
-; CHECK: vpblendmd
-define <8 x i32> @test_x86_mask_blend_d_256(i8 %a0, <8 x i32> %a1, <8 x i32> %a2) {
- %res = call <8 x i32> @llvm.x86.avx512.mask.blend.d.256(<8 x i32> %a1, <8 x i32> %a2, i8 %a0) ; <<8 x i32>> [#uses=1]
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx512.mask.blend.d.256(<8 x i32>, <8 x i32>, i8) nounwind readonly
-
-define <4 x i64> @test_x86_mask_blend_q_256(i8 %a0, <4 x i64> %a1, <4 x i64> %a2) {
- ; CHECK: vpblendmq
- %res = call <4 x i64> @llvm.x86.avx512.mask.blend.q.256(<4 x i64> %a1, <4 x i64> %a2, i8 %a0) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx512.mask.blend.q.256(<4 x i64>, <4 x i64>, i8) nounwind readonly
-
-define <4 x float> @test_x86_mask_blend_ps_128(i8 %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: vblendmps %xmm1, %xmm0
- %res = call <4 x float> @llvm.x86.avx512.mask.blend.ps.128(<4 x float> %a1, <4 x float> %a2, i8 %a0) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.blend.ps.128(<4 x float>, <4 x float>, i8) nounwind readonly
-
-define <2 x double> @test_x86_mask_blend_pd_128(i8 %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK: vblendmpd %xmm1, %xmm0
- %res = call <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double> %a1, <2 x double> %a2, i8 %a0) ; <<2 x double>> [#uses=1]
- ret <2 x double> %res
-}
-
-define <2 x double> @test_x86_mask_blend_pd_128_memop(<2 x double> %a, <2 x double>* %ptr, i8 %mask) {
- ; CHECK-LABEL: test_x86_mask_blend_pd_128_memop
- ; CHECK: vblendmpd (%
- %b = load <2 x double>, <2 x double>* %ptr
- %res = call <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double> %a, <2 x double> %b, i8 %mask) ; <<2 x double>> [#uses=1]
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double>, <2 x double>, i8) nounwind readonly
-
-define <4 x i32> @test_x86_mask_blend_d_128(i8 %a0, <4 x i32> %a1, <4 x i32> %a2) {
- ; CHECK: vpblendmd
- %res = call <4 x i32> @llvm.x86.avx512.mask.blend.d.128(<4 x i32> %a1, <4 x i32> %a2, i8 %a0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.avx512.mask.blend.d.128(<4 x i32>, <4 x i32>, i8) nounwind readonly
-
-define <2 x i64> @test_x86_mask_blend_q_128(i8 %a0, <2 x i64> %a1, <2 x i64> %a2) {
- ; CHECK: vpblendmq
- %res = call <2 x i64> @llvm.x86.avx512.mask.blend.q.128(<2 x i64> %a1, <2 x i64> %a2, i8 %a0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.avx512.mask.blend.q.128(<2 x i64>, <2 x i64>, i8) nounwind readonly
-
-
define < 2 x i64> @test_mask_mul_epi32_rr_128(< 4 x i32> %a, < 4 x i32> %b) {
- ;CHECK-LABEL: test_mask_mul_epi32_rr_128
- ;CHECK: vpmuldq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x28,0xc1]
+; CHECK-LABEL: test_mask_mul_epi32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 -1)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epi32_rrk_128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rrk_128
- ;CHECK: vpmuldq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x28,0xd1]
+; CHECK-LABEL: test_mask_mul_epi32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmuldq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x28,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epi32_rrkz_128(< 4 x i32> %a, < 4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rrkz_128
- ;CHECK: vpmuldq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x28,0xc1]
+; CHECK-LABEL: test_mask_mul_epi32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 %mask)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epi32_rm_128(< 4 x i32> %a, < 4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mul_epi32_rm_128
- ;CHECK: vpmuldq (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x28,0x07]
+; CHECK-LABEL: test_mask_mul_epi32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 4 x i32>, < 4 x i32>* %ptr_b
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 -1)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epi32_rmk_128(< 4 x i32> %a, < 4 x i32>* %ptr_b, < 2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmk_128
- ;CHECK: vpmuldq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x28,0x0f]
+; CHECK-LABEL: test_mask_mul_epi32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuldq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x28,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 4 x i32>, < 4 x i32>* %ptr_b
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epi32_rmkz_128(< 4 x i32> %a, < 4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmkz_128
- ;CHECK: vpmuldq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x28,0x07]
+; CHECK-LABEL: test_mask_mul_epi32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 4 x i32>, < 4 x i32>* %ptr_b
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 %mask)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epi32_rmb_128(< 4 x i32> %a, i64* %ptr_b) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmb_128
- ;CHECK: vpmuldq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x18,0x28,0x07]
+; CHECK-LABEL: test_mask_mul_epi32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuldq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x18,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, <2 x i32> zeroinitializer
@@ -921,8 +1088,12 @@ define < 2 x i64> @test_mask_mul_epi32_rmb_128(< 4 x i32> %a, i64* %ptr_b) {
}
define < 2 x i64> @test_mask_mul_epi32_rmbk_128(< 4 x i32> %a, i64* %ptr_b, < 2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmbk_128
- ;CHECK: vpmuldq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0x28,0x0f]
+; CHECK-LABEL: test_mask_mul_epi32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuldq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0x28,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, <2 x i32> zeroinitializer
@@ -932,8 +1103,11 @@ define < 2 x i64> @test_mask_mul_epi32_rmbk_128(< 4 x i32> %a, i64* %ptr_b, < 2
}
define < 2 x i64> @test_mask_mul_epi32_rmbkz_128(< 4 x i32> %a, i64* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmbkz_128
- ;CHECK: vpmuldq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x99,0x28,0x07]
+; CHECK-LABEL: test_mask_mul_epi32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuldq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x99,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, < 2 x i32> zeroinitializer
@@ -945,53 +1119,73 @@ define < 2 x i64> @test_mask_mul_epi32_rmbkz_128(< 4 x i32> %a, i64* %ptr_b, i8
declare < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32>, < 4 x i32>, < 2 x i64>, i8)
define < 4 x i64> @test_mask_mul_epi32_rr_256(< 8 x i32> %a, < 8 x i32> %b) {
- ;CHECK-LABEL: test_mask_mul_epi32_rr_256
- ;CHECK: vpmuldq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x28,0xc1]
+; CHECK-LABEL: test_mask_mul_epi32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 -1)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epi32_rrk_256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rrk_256
- ;CHECK: vpmuldq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x28,0xd1]
+; CHECK-LABEL: test_mask_mul_epi32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x28,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epi32_rrkz_256(< 8 x i32> %a, < 8 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rrkz_256
- ;CHECK: vpmuldq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x28,0xc1]
+; CHECK-LABEL: test_mask_mul_epi32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 %mask)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epi32_rm_256(< 8 x i32> %a, < 8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mul_epi32_rm_256
- ;CHECK: vpmuldq (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x28,0x07]
+; CHECK-LABEL: test_mask_mul_epi32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuldq (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 8 x i32>, < 8 x i32>* %ptr_b
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 -1)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epi32_rmk_256(< 8 x i32> %a, < 8 x i32>* %ptr_b, < 4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmk_256
- ;CHECK: vpmuldq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x28,0x0f]
+; CHECK-LABEL: test_mask_mul_epi32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuldq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x28,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 8 x i32>, < 8 x i32>* %ptr_b
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epi32_rmkz_256(< 8 x i32> %a, < 8 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmkz_256
- ;CHECK: vpmuldq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x28,0x07]
+; CHECK-LABEL: test_mask_mul_epi32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuldq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 8 x i32>, < 8 x i32>* %ptr_b
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 %mask)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epi32_rmb_256(< 8 x i32> %a, i64* %ptr_b) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmb_256
- ;CHECK: vpmuldq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x38,0x28,0x07]
+; CHECK-LABEL: test_mask_mul_epi32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuldq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x38,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer
@@ -1001,8 +1195,12 @@ define < 4 x i64> @test_mask_mul_epi32_rmb_256(< 8 x i32> %a, i64* %ptr_b) {
}
define < 4 x i64> @test_mask_mul_epi32_rmbk_256(< 8 x i32> %a, i64* %ptr_b, < 4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmbk_256
- ;CHECK: vpmuldq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x39,0x28,0x0f]
+; CHECK-LABEL: test_mask_mul_epi32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuldq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x39,0x28,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer
@@ -1012,8 +1210,11 @@ define < 4 x i64> @test_mask_mul_epi32_rmbk_256(< 8 x i32> %a, i64* %ptr_b, < 4
}
define < 4 x i64> @test_mask_mul_epi32_rmbkz_256(< 8 x i32> %a, i64* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epi32_rmbkz_256
- ;CHECK: vpmuldq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xb9,0x28,0x07]
+; CHECK-LABEL: test_mask_mul_epi32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuldq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xb9,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer
@@ -1025,53 +1226,73 @@ define < 4 x i64> @test_mask_mul_epi32_rmbkz_256(< 8 x i32> %a, i64* %ptr_b, i8
declare < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32>, < 8 x i32>, < 4 x i64>, i8)
define < 2 x i64> @test_mask_mul_epu32_rr_128(< 4 x i32> %a, < 4 x i32> %b) {
- ;CHECK-LABEL: test_mask_mul_epu32_rr_128
- ;CHECK: vpmuludq %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf4,0xc1]
+; CHECK-LABEL: test_mask_mul_epu32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xf4,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 -1)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epu32_rrk_128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rrk_128
- ;CHECK: vpmuludq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xf4,0xd1]
+; CHECK-LABEL: test_mask_mul_epu32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xf4,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epu32_rrkz_128(< 4 x i32> %a, < 4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rrkz_128
- ;CHECK: vpmuludq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xf4,0xc1]
+; CHECK-LABEL: test_mask_mul_epu32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xf4,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 %mask)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epu32_rm_128(< 4 x i32> %a, < 4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mul_epu32_rm_128
- ;CHECK: vpmuludq (%rdi), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf4,0x07]
+; CHECK-LABEL: test_mask_mul_epu32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xf4,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 4 x i32>, < 4 x i32>* %ptr_b
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 -1)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epu32_rmk_128(< 4 x i32> %a, < 4 x i32>* %ptr_b, < 2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmk_128
- ;CHECK: vpmuludq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xf4,0x0f]
+; CHECK-LABEL: test_mask_mul_epu32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuludq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xf4,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 4 x i32>, < 4 x i32>* %ptr_b
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epu32_rmkz_128(< 4 x i32> %a, < 4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmkz_128
- ;CHECK: vpmuludq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xf4,0x07]
+; CHECK-LABEL: test_mask_mul_epu32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xf4,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 4 x i32>, < 4 x i32>* %ptr_b
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 %mask)
ret < 2 x i64> %res
}
define < 2 x i64> @test_mask_mul_epu32_rmb_128(< 4 x i32> %a, i64* %ptr_b) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmb_128
- ;CHECK: vpmuludq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x18,0xf4,0x07]
+; CHECK-LABEL: test_mask_mul_epu32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuludq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x18,0xf4,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, <2 x i32> zeroinitializer
@@ -1081,8 +1302,12 @@ define < 2 x i64> @test_mask_mul_epu32_rmb_128(< 4 x i32> %a, i64* %ptr_b) {
}
define < 2 x i64> @test_mask_mul_epu32_rmbk_128(< 4 x i32> %a, i64* %ptr_b, < 2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmbk_128
- ;CHECK: vpmuludq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x19,0xf4,0x0f]
+; CHECK-LABEL: test_mask_mul_epu32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuludq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x19,0xf4,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, <2 x i32> zeroinitializer
@@ -1092,8 +1317,11 @@ define < 2 x i64> @test_mask_mul_epu32_rmbk_128(< 4 x i32> %a, i64* %ptr_b, < 2
}
define < 2 x i64> @test_mask_mul_epu32_rmbkz_128(< 4 x i32> %a, i64* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmbkz_128
- ;CHECK: vpmuludq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0xf4,0x07]
+; CHECK-LABEL: test_mask_mul_epu32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuludq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0xf4,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, < 2 x i32> zeroinitializer
@@ -1105,53 +1333,73 @@ define < 2 x i64> @test_mask_mul_epu32_rmbkz_128(< 4 x i32> %a, i64* %ptr_b, i8
declare < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32>, < 4 x i32>, < 2 x i64>, i8)
define < 4 x i64> @test_mask_mul_epu32_rr_256(< 8 x i32> %a, < 8 x i32> %b) {
- ;CHECK-LABEL: test_mask_mul_epu32_rr_256
- ;CHECK: vpmuludq %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf4,0xc1]
+; CHECK-LABEL: test_mask_mul_epu32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xf4,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 -1)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epu32_rrk_256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rrk_256
- ;CHECK: vpmuludq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xf4,0xd1]
+; CHECK-LABEL: test_mask_mul_epu32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xf4,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epu32_rrkz_256(< 8 x i32> %a, < 8 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rrkz_256
- ;CHECK: vpmuludq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0xc1]
+; CHECK-LABEL: test_mask_mul_epu32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 %mask)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epu32_rm_256(< 8 x i32> %a, < 8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_mul_epu32_rm_256
- ;CHECK: vpmuludq (%rdi), %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf4,0x07]
+; CHECK-LABEL: test_mask_mul_epu32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuludq (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xf4,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 8 x i32>, < 8 x i32>* %ptr_b
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 -1)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epu32_rmk_256(< 8 x i32> %a, < 8 x i32>* %ptr_b, < 4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmk_256
- ;CHECK: vpmuludq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xf4,0x0f]
+; CHECK-LABEL: test_mask_mul_epu32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuludq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xf4,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 8 x i32>, < 8 x i32>* %ptr_b
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epu32_rmkz_256(< 8 x i32> %a, < 8 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmkz_256
- ;CHECK: vpmuludq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0x07]
+; CHECK-LABEL: test_mask_mul_epu32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuludq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 8 x i32>, < 8 x i32>* %ptr_b
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 %mask)
ret < 4 x i64> %res
}
define < 4 x i64> @test_mask_mul_epu32_rmb_256(< 8 x i32> %a, i64* %ptr_b) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmb_256
- ;CHECK: vpmuludq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x38,0xf4,0x07]
+; CHECK-LABEL: test_mask_mul_epu32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmuludq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x38,0xf4,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer
@@ -1161,8 +1409,12 @@ define < 4 x i64> @test_mask_mul_epu32_rmb_256(< 8 x i32> %a, i64* %ptr_b) {
}
define < 4 x i64> @test_mask_mul_epu32_rmbk_256(< 8 x i32> %a, i64* %ptr_b, < 4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmbk_256
- ;CHECK: vpmuludq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x39,0xf4,0x0f]
+; CHECK-LABEL: test_mask_mul_epu32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuludq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x39,0xf4,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer
@@ -1172,8 +1424,11 @@ define < 4 x i64> @test_mask_mul_epu32_rmbk_256(< 8 x i32> %a, i64* %ptr_b, < 4
}
define < 4 x i64> @test_mask_mul_epu32_rmbkz_256(< 8 x i32> %a, i64* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_mul_epu32_rmbkz_256
- ;CHECK: vpmuludq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0xf4,0x07]
+; CHECK-LABEL: test_mask_mul_epu32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmuludq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0xf4,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
%vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0
%b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer
@@ -1185,53 +1440,73 @@ define < 4 x i64> @test_mask_mul_epu32_rmbkz_256(< 8 x i32> %a, i64* %ptr_b, i8
declare < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32>, < 8 x i32>, < 4 x i64>, i8)
define <4 x i32> @test_mask_add_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
- ;CHECK-LABEL: test_mask_add_epi32_rr_128
- ;CHECK: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc1]
+; CHECK-LABEL: test_mask_add_epi32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_add_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rrk_128
- ;CHECK: vpaddd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0xd1]
+; CHECK-LABEL: test_mask_add_epi32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_add_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rrkz_128
- ;CHECK: vpaddd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0xc1]
+; CHECK-LABEL: test_mask_add_epi32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_add_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_add_epi32_rm_128
- ;CHECK: vpaddd (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0x07]
+; CHECK-LABEL: test_mask_add_epi32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_add_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rmk_128
- ;CHECK: vpaddd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0x0f]
+; CHECK-LABEL: test_mask_add_epi32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_add_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rmkz_128
- ;CHECK: vpaddd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0x07]
+; CHECK-LABEL: test_mask_add_epi32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_add_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_add_epi32_rmb_128
- ;CHECK: vpaddd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xfe,0x07]
+; CHECK-LABEL: test_mask_add_epi32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xfe,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -1240,8 +1515,12 @@ define <4 x i32> @test_mask_add_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
}
define <4 x i32> @test_mask_add_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rmbk_128
- ;CHECK: vpaddd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xfe,0x0f]
+; CHECK-LABEL: test_mask_add_epi32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xfe,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -1250,8 +1529,11 @@ define <4 x i32> @test_mask_add_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i
}
define <4 x i32> @test_mask_add_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rmbkz_128
- ;CHECK: vpaddd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xfe,0x07]
+; CHECK-LABEL: test_mask_add_epi32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xfe,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -1262,53 +1544,73 @@ define <4 x i32> @test_mask_add_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %m
declare <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
define <4 x i32> @test_mask_sub_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
- ;CHECK-LABEL: test_mask_sub_epi32_rr_128
- ;CHECK: vpsubd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfa,0xc1]
+; CHECK-LABEL: test_mask_sub_epi32_rr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfa,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_sub_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rrk_128
- ;CHECK: vpsubd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfa,0xd1]
+; CHECK-LABEL: test_mask_sub_epi32_rrk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfa,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_sub_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rrkz_128
- ;CHECK: vpsubd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfa,0xc1]
+; CHECK-LABEL: test_mask_sub_epi32_rrkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfa,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_sub_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_sub_epi32_rm_128
- ;CHECK: (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfa,0x07]
+; CHECK-LABEL: test_mask_sub_epi32_rm_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubd (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfa,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_sub_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmk_128
- ;CHECK: vpsubd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfa,0x0f]
+; CHECK-LABEL: test_mask_sub_epi32_rmk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfa,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_sub_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmkz_128
- ;CHECK: vpsubd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfa,0x07]
+; CHECK-LABEL: test_mask_sub_epi32_rmkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfa,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
%res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
ret <4 x i32> %res
}
define <4 x i32> @test_mask_sub_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmb_128
- ;CHECK: vpsubd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xfa,0x07]
+; CHECK-LABEL: test_mask_sub_epi32_rmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xfa,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -1317,8 +1619,12 @@ define <4 x i32> @test_mask_sub_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
}
define <4 x i32> @test_mask_sub_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmbk_128
- ;CHECK: vpsubd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xfa,0x0f]
+; CHECK-LABEL: test_mask_sub_epi32_rmbk_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xfa,0x0f]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -1327,8 +1633,11 @@ define <4 x i32> @test_mask_sub_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i
}
define <4 x i32> @test_mask_sub_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmbkz_128
- ;CHECK: vpsubd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xfa,0x07]
+; CHECK-LABEL: test_mask_sub_epi32_rmbkz_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xfa,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -1339,53 +1648,73 @@ define <4 x i32> @test_mask_sub_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %m
declare <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
define <8 x i32> @test_mask_sub_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
- ;CHECK-LABEL: test_mask_sub_epi32_rr_256
- ;CHECK: vpsubd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfa,0xc1]
+; CHECK-LABEL: test_mask_sub_epi32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfa,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_sub_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rrk_256
- ;CHECK: vpsubd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfa,0xd1]
+; CHECK-LABEL: test_mask_sub_epi32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfa,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_sub_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rrkz_256
- ;CHECK: vpsubd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfa,0xc1]
+; CHECK-LABEL: test_mask_sub_epi32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfa,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_sub_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_sub_epi32_rm_256
- ;CHECK: vpsubd (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfa,0x07]
+; CHECK-LABEL: test_mask_sub_epi32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubd (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfa,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_sub_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmk_256
- ;CHECK: vpsubd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfa,0x0f]
+; CHECK-LABEL: test_mask_sub_epi32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfa,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_sub_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmkz_256
- ;CHECK: vpsubd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfa,0x07]
+; CHECK-LABEL: test_mask_sub_epi32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfa,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_sub_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmb_256
- ;CHECK: vpsubd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xfa,0x07]
+; CHECK-LABEL: test_mask_sub_epi32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xfa,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -1394,8 +1723,12 @@ define <8 x i32> @test_mask_sub_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
}
define <8 x i32> @test_mask_sub_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmbk_256
- ;CHECK: vpsubd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xfa,0x0f]
+; CHECK-LABEL: test_mask_sub_epi32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xfa,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -1404,8 +1737,11 @@ define <8 x i32> @test_mask_sub_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i
}
define <8 x i32> @test_mask_sub_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_sub_epi32_rmbkz_256
- ;CHECK: vpsubd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xfa,0x07]
+; CHECK-LABEL: test_mask_sub_epi32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsubd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xfa,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -1416,53 +1752,73 @@ define <8 x i32> @test_mask_sub_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %m
declare <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
define <8 x i32> @test_mask_add_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
- ;CHECK-LABEL: test_mask_add_epi32_rr_256
- ;CHECK: vpaddd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfe,0xc1]
+; CHECK-LABEL: test_mask_add_epi32_rr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_add_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rrk_256
- ;CHECK: vpaddd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0xd1]
+; CHECK-LABEL: test_mask_add_epi32_rrk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_add_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rrkz_256
- ;CHECK: vpaddd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0xc1]
+; CHECK-LABEL: test_mask_add_epi32_rrkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_add_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_add_epi32_rm_256
- ;CHECK: vpaddd (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfe,0x07]
+; CHECK-LABEL: test_mask_add_epi32_rm_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfe,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_add_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rmk_256
- ;CHECK: vpaddd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0x0f]
+; CHECK-LABEL: test_mask_add_epi32_rmk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_add_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rmkz_256
- ;CHECK: vpaddd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0x07]
+; CHECK-LABEL: test_mask_add_epi32_rmkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
%res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
ret <8 x i32> %res
}
define <8 x i32> @test_mask_add_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_add_epi32_rmb_256
- ;CHECK: vpaddd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xfe,0x07]
+; CHECK-LABEL: test_mask_add_epi32_rmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpaddd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xfe,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -1471,8 +1827,12 @@ define <8 x i32> @test_mask_add_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
}
define <8 x i32> @test_mask_add_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rmbk_256
- ;CHECK: vpaddd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xfe,0x0f]
+; CHECK-LABEL: test_mask_add_epi32_rmbk_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xfe,0x0f]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -1481,8 +1841,11 @@ define <8 x i32> @test_mask_add_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i
}
define <8 x i32> @test_mask_add_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_add_epi32_rmbkz_256
- ;CHECK: vpaddd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xfe,0x07]
+; CHECK-LABEL: test_mask_add_epi32_rmbkz_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpaddd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xfe,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
%b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -1492,1079 +1855,443 @@ define <8 x i32> @test_mask_add_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %m
declare <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-define <4 x i32> @test_mask_and_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
- ;CHECK-LABEL: test_mask_and_epi32_rr_128
- ;CHECK: vpandd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdb,0xc1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_and_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rrk_128
- ;CHECK: vpandd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdb,0xd1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_and_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rrkz_128
- ;CHECK: vpandd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdb,0xc1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_and_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_epi32_rm_128
- ;CHECK: vpandd (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdb,0x07]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_and_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rmk_128
- ;CHECK: vpandd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdb,0x0f]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_and_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rmkz_128
- ;CHECK: vpandd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdb,0x07]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_and_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_epi32_rmb_128
- ;CHECK: vpandd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xdb,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_and_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rmbk_128
- ;CHECK: vpandd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xdb,0x0f]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_and_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rmbkz_128
- ;CHECK: vpandd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xdb,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-declare <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-
-define <8 x i32> @test_mask_and_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
- ;CHECK-LABEL: test_mask_and_epi32_rr_256
- ;CHECK: vpandd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdb,0xc1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_and_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rrk_256
- ;CHECK: vpandd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdb,0xd1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_and_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rrkz_256
- ;CHECK: vpandd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0xc1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_and_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_epi32_rm_256
- ;CHECK: vpandd (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdb,0x07]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_and_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rmk_256
- ;CHECK: vpandd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdb,0x0f]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_and_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rmkz_256
- ;CHECK: vpandd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0x07]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_and_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_and_epi32_rmb_256
- ;CHECK: vpandd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xdb,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_and_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rmbk_256
- ;CHECK: vpandd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xdb,0x0f]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_and_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_and_epi32_rmbkz_256
- ;CHECK: vpandd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xdb,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-
-define <4 x i32> @test_mask_or_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
- ;CHECK-LABEL: test_mask_or_epi32_rr_128
- ;CHECK: vpord %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xeb,0xc1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_or_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rrk_128
- ;CHECK: vpord %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xeb,0xd1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_or_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rrkz_128
- ;CHECK: vpord %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xeb,0xc1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_or_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_epi32_rm_128
- ;CHECK: vpord (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xeb,0x07]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_or_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rmk_128
- ;CHECK: vpord (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xeb,0x0f]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_or_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rmkz_128
- ;CHECK: vpord (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xeb,0x07]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_or_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_epi32_rmb_128
- ;CHECK: vpord (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xeb,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_or_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rmbk_128
- ;CHECK: vpord (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xeb,0x0f]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_or_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rmbkz_128
- ;CHECK: vpord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xeb,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-declare <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-
-define <8 x i32> @test_mask_or_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
- ;CHECK-LABEL: test_mask_or_epi32_rr_256
- ;CHECK: vpord %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xeb,0xc1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_or_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rrk_256
- ;CHECK: vpord %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xeb,0xd1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_or_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rrkz_256
- ;CHECK: vpord %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0xc1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_or_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_epi32_rm_256
- ;CHECK: vpord (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xeb,0x07]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_or_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rmk_256
- ;CHECK: vpord (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xeb,0x0f]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_or_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rmkz_256
- ;CHECK: vpord (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0x07]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_or_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_or_epi32_rmb_256
- ;CHECK: vpord (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xeb,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_or_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rmbk_256
- ;CHECK: vpord (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xeb,0x0f]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_or_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_or_epi32_rmbkz_256
- ;CHECK: vpord (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xeb,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-
-define <4 x i32> @test_mask_xor_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
- ;CHECK-LABEL: test_mask_xor_epi32_rr_128
- ;CHECK: vpxord %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xef,0xc1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_xor_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rrk_128
- ;CHECK: vpxord %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xef,0xd1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_xor_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rrkz_128
- ;CHECK: vpxord %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xef,0xc1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_xor_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_epi32_rm_128
- ;CHECK: vpxord (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xef,0x07]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_xor_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmk_128
- ;CHECK: vpxord (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xef,0x0f]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_xor_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmkz_128
- ;CHECK: vpxord (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xef,0x07]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_xor_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmb_128
- ;CHECK: vpxord (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xef,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_xor_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmbk_128
- ;CHECK: vpxord (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xef,0x0f]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_xor_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmbkz_128
- ;CHECK: vpxord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xef,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-declare <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-
-define <8 x i32> @test_mask_xor_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
- ;CHECK-LABEL: test_mask_xor_epi32_rr_256
- ;CHECK: vpxord %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xef,0xc1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_xor_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rrk_256
- ;CHECK: vpxord %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xef,0xd1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_xor_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rrkz_256
- ;CHECK: vpxord %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xef,0xc1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_xor_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_epi32_rm_256
- ;CHECK: vpxord (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xef,0x07]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_xor_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmk_256
- ;CHECK: vpxord (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xef,0x0f]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_xor_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmkz_256
- ;CHECK: vpxord (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xef,0x07]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_xor_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmb_256
- ;CHECK: vpxord (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xef,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_xor_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmbk_256
- ;CHECK: vpxord (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xef,0x0f]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_xor_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_xor_epi32_rmbkz_256
- ;CHECK: vpxord (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xef,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-
-define <4 x i32> @test_mask_andnot_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rr_128
- ;CHECK: vpandnd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdf,0xc1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_andnot_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rrk_128
- ;CHECK: vpandnd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdf,0xd1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_andnot_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rrkz_128
- ;CHECK: vpandnd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdf,0xc1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_andnot_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rm_128
- ;CHECK: vpandnd (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdf,0x07]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_andnot_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmk_128
- ;CHECK: vpandnd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdf,0x0f]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_andnot_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmkz_128
- ;CHECK: vpandnd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdf,0x07]
- %b = load <4 x i32>, <4 x i32>* %ptr_b
- %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_andnot_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmb_128
- ;CHECK: vpandnd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xdf,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_andnot_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmbk_128
- ;CHECK: vpandnd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xdf,0x0f]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
- ret <4 x i32> %res
-}
-
-define <4 x i32> @test_mask_andnot_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmbkz_128
- ;CHECK: vpandnd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xdf,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
- %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
- %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-declare <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-
-define <8 x i32> @test_mask_andnot_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rr_256
- ;CHECK: vpandnd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdf,0xc1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_andnot_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rrk_256
- ;CHECK: vpandnd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdf,0xd1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_andnot_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rrkz_256
- ;CHECK: vpandnd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0xc1]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_andnot_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rm_256
- ;CHECK: vpandnd (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdf,0x07]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_andnot_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmk_256
- ;CHECK: vpandnd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdf,0x0f]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_andnot_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmkz_256
- ;CHECK: vpandnd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0x07]
- %b = load <8 x i32>, <8 x i32>* %ptr_b
- %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_andnot_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmb_256
- ;CHECK: vpandnd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xdf,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_andnot_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmbk_256
- ;CHECK: vpandnd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xdf,0x0f]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
- ret <8 x i32> %res
-}
-
-define <8 x i32> @test_mask_andnot_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi32_rmbkz_256
- ;CHECK: vpandnd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xdf,0x07]
- %q = load i32, i32* %ptr_b
- %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
- %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
- %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
- ret <8 x i32> %res
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-
-define <2 x i64> @test_mask_andnot_epi64_rr_128(<2 x i64> %a, <2 x i64> %b) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rr_128
- ;CHECK: vpandnq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xdf,0xc1]
- %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
- ret <2 x i64> %res
-}
-
-define <2 x i64> @test_mask_andnot_epi64_rrk_128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rrk_128
- ;CHECK: vpandnq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xdf,0xd1]
- %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
- ret <2 x i64> %res
-}
-
-define <2 x i64> @test_mask_andnot_epi64_rrkz_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rrkz_128
- ;CHECK: vpandnq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xdf,0xc1]
- %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
- ret <2 x i64> %res
-}
-
-define <2 x i64> @test_mask_andnot_epi64_rm_128(<2 x i64> %a, <2 x i64>* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rm_128
- ;CHECK: vpandnq (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xdf,0x07]
- %b = load <2 x i64>, <2 x i64>* %ptr_b
- %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
- ret <2 x i64> %res
-}
-
-define <2 x i64> @test_mask_andnot_epi64_rmk_128(<2 x i64> %a, <2 x i64>* %ptr_b, <2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmk_128
- ;CHECK: vpandnq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xdf,0x0f]
- %b = load <2 x i64>, <2 x i64>* %ptr_b
- %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
- ret <2 x i64> %res
-}
-
-define <2 x i64> @test_mask_andnot_epi64_rmkz_128(<2 x i64> %a, <2 x i64>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmkz_128
- ;CHECK: vpandnq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xdf,0x07]
- %b = load <2 x i64>, <2 x i64>* %ptr_b
- %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
- ret <2 x i64> %res
-}
-
-define <2 x i64> @test_mask_andnot_epi64_rmb_128(<2 x i64> %a, i64* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmb_128
- ;CHECK: vpandnq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x18,0xdf,0x07]
- %q = load i64, i64* %ptr_b
- %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
- %b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
- %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
- ret <2 x i64> %res
-}
-
-define <2 x i64> @test_mask_andnot_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmbk_128
- ;CHECK: vpandnq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x19,0xdf,0x0f]
- %q = load i64, i64* %ptr_b
- %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
- %b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
- %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
- ret <2 x i64> %res
-}
-
-define <2 x i64> @test_mask_andnot_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmbkz_128
- ;CHECK: vpandnq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0xdf,0x07]
- %q = load i64, i64* %ptr_b
- %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
- %b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
- %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
- ret <2 x i64> %res
-}
-
-declare <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-
-define <4 x i64> @test_mask_andnot_epi64_rr_256(<4 x i64> %a, <4 x i64> %b) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rr_256
- ;CHECK: vpandnq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xdf,0xc1]
- %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
- ret <4 x i64> %res
-}
-
-define <4 x i64> @test_mask_andnot_epi64_rrk_256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rrk_256
- ;CHECK: vpandnq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xdf,0xd1]
- %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
- ret <4 x i64> %res
-}
-
-define <4 x i64> @test_mask_andnot_epi64_rrkz_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rrkz_256
- ;CHECK: vpandnq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0xc1]
- %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
- ret <4 x i64> %res
-}
-
-define <4 x i64> @test_mask_andnot_epi64_rm_256(<4 x i64> %a, <4 x i64>* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rm_256
- ;CHECK: vpandnq (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xdf,0x07]
- %b = load <4 x i64>, <4 x i64>* %ptr_b
- %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
- ret <4 x i64> %res
-}
-
-define <4 x i64> @test_mask_andnot_epi64_rmk_256(<4 x i64> %a, <4 x i64>* %ptr_b, <4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmk_256
- ;CHECK: vpandnq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xdf,0x0f]
- %b = load <4 x i64>, <4 x i64>* %ptr_b
- %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
- ret <4 x i64> %res
-}
-
-define <4 x i64> @test_mask_andnot_epi64_rmkz_256(<4 x i64> %a, <4 x i64>* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmkz_256
- ;CHECK: vpandnq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0x07]
- %b = load <4 x i64>, <4 x i64>* %ptr_b
- %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
- ret <4 x i64> %res
-}
-
-define <4 x i64> @test_mask_andnot_epi64_rmb_256(<4 x i64> %a, i64* %ptr_b) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmb_256
- ;CHECK: vpandnq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x38,0xdf,0x07]
- %q = load i64, i64* %ptr_b
- %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
- %b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
- %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
- ret <4 x i64> %res
-}
-
-define <4 x i64> @test_mask_andnot_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 x i64> %passThru, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmbk_256
- ;CHECK: vpandnq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x39,0xdf,0x0f]
- %q = load i64, i64* %ptr_b
- %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
- %b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
- %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
- ret <4 x i64> %res
-}
-
-define <4 x i64> @test_mask_andnot_epi64_rmbkz_256(<4 x i64> %a, i64* %ptr_b, i8 %mask) {
- ;CHECK-LABEL: test_mask_andnot_epi64_rmbkz_256
- ;CHECK: vpandnq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0xdf,0x07]
- %q = load i64, i64* %ptr_b
- %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
- %b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
- %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
- ret <4 x i64> %res
-}
-
-declare <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-
define i8 @test_cmpps_256(<8 x float> %a, <8 x float> %b) {
- ;CHECK: vcmpleps %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc1,0x02]
+; CHECK-LABEL: test_cmpps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpleps %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc1,0x02]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2, i8 -1)
ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> , <8 x float> , i32, i8)
define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) {
- ;CHECK: vcmpleps %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x02]
+; CHECK-LABEL: test_cmpps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpleps %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x02]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2, i8 -1)
ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> , <4 x float> , i32, i8)
define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) {
- ;CHECK: vcmplepd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x28,0xc2,0xc1,0x02]
+; CHECK-LABEL: test_cmppd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmplepd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x28,0xc2,0xc1,0x02]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2, i8 -1)
ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> , <4 x double> , i32, i8)
define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) {
- ;CHECK: vcmplepd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x08,0xc2,0xc1,0x02]
+; CHECK-LABEL: test_cmppd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmplepd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x08,0xc2,0xc1,0x02]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2, i8 -1)
ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> , <2 x double> , i32, i8)
define <8 x float> @test_mm512_maskz_add_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_add_ps_256
- ;CHECK: vaddps %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_add_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_mask_add_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_add_ps_256
- ;CHECK: vaddps %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_add_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x58,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_add_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_add_ps_256
- ;CHECK: vaddps %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mm512_add_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <4 x float> @test_mm512_maskz_add_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_add_ps_128
- ;CHECK: vaddps %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_add_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_mask_add_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_add_ps_128
- ;CHECK: vaddps %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_add_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x58,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_add_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_add_ps_128
- ;CHECK: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mm512_add_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <8 x float> @test_mm512_maskz_sub_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_sub_ps_256
- ;CHECK: vsubps %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_sub_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vsubps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5c,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_mask_sub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_sub_ps_256
- ;CHECK: vsubps %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_sub_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vsubps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5c,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_sub_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_sub_ps_256
- ;CHECK: vsubps %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mm512_sub_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vsubps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x5c,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <4 x float> @test_mm512_maskz_sub_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_sub_ps_128
- ;CHECK: vsubps %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_sub_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5c,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_mask_sub_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_sub_ps_128
- ;CHECK: vsubps %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_sub_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5c,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_sub_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_sub_ps_128
- ;CHECK: vsubps %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mm512_sub_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x5c,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <8 x float> @test_mm512_maskz_mul_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_mul_ps_256
- ;CHECK: vmulps %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_mul_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmulps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x59,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_mask_mul_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_mul_ps_256
- ;CHECK: vmulps %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_mul_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmulps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x59,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_mul_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mul_ps_256
- ;CHECK: vmulps %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mm512_mul_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmulps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x59,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <4 x float> @test_mm512_maskz_mul_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_mul_ps_128
- ;CHECK: vmulps %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_mul_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmulps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x59,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_mask_mul_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_mul_ps_128
- ;CHECK: vmulps %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_mul_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmulps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x59,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_mul_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mul_ps_128
- ;CHECK: vmulps %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mm512_mul_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmulps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x59,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <8 x float> @test_mm512_maskz_div_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_div_ps_256
- ;CHECK: vdivps %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_div_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vdivps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5e,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_mask_div_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_div_ps_256
- ;CHECK: vdivps %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_div_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vdivps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5e,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_div_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_div_ps_256
- ;CHECK: vdivps %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mm512_div_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vdivps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x5e,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <4 x float> @test_mm512_maskz_div_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_div_ps_128
- ;CHECK: vdivps %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_div_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vdivps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5e,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_mask_div_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_div_ps_128
- ;CHECK: vdivps %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_div_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vdivps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5e,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_div_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_div_ps_128
- ;CHECK: vdivps %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mm512_div_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vdivps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x5e,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_max_ps_256
- ;CHECK: vmaxps %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_max_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5f,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_mask_max_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_max_ps_256
- ;CHECK: vmaxps %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_max_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5f,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_max_ps_256
- ;CHECK: vmaxps %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mm512_max_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x5f,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_max_ps_128
- ;CHECK: vmaxps %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_max_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5f,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_max_ps_128
- ;CHECK: vmaxps %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_max_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5f,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_max_ps_128
- ;CHECK: vmaxps %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mm512_max_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x5f,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <8 x float> @test_mm512_maskz_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_min_ps_256
- ;CHECK: vminps %ymm1, %ymm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_min_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5d,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_mask_min_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_min_ps_256
- ;CHECK: vminps %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_min_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5d,0xd1]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_mm512_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_min_ps_256
- ;CHECK: vminps %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_mm512_min_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x5d,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_maskz_min_ps_128
- ;CHECK: vminps %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_mm512_maskz_min_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5d,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
- ;CHECK-LABEL: test_mm512_mask_min_ps_128
- ;CHECK: vminps %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-LABEL: test_mm512_mask_min_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5d,0xd1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
- ;CHECK-LABEL: test_mm512_min_ps_128
- ;CHECK: vminps %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_mm512_min_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x5d,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
define <4 x double> @test_sqrt_pd_256(<4 x double> %a0, i8 %mask) {
- ; CHECK-LABEL: test_sqrt_pd_256
- ; CHECK: vsqrtpd
+; CHECK-LABEL: test_sqrt_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x51,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
define <8 x float> @test_sqrt_ps_256(<8 x float> %a0, i8 %mask) {
- ; CHECK-LABEL: test_sqrt_ps_256
- ; CHECK: vsqrtps
+; CHECK-LABEL: test_sqrt_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x51,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
@@ -2572,8 +2299,10 @@ define <8 x float> @test_sqrt_ps_256(<8 x float> %a0, i8 %mask) {
declare <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
define <4 x double> @test_getexp_pd_256(<4 x double> %a0) {
- ; CHECK-LABEL: test_getexp_pd_256
- ; CHECK: vgetexppd
+; CHECK-LABEL: test_getexp_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vgetexppd %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x42,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1)
ret <4 x double> %res
}
@@ -2581,8 +2310,10 @@ define <4 x double> @test_getexp_pd_256(<4 x double> %a0) {
declare <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
define <8 x float> @test_getexp_ps_256(<8 x float> %a0) {
- ; CHECK-LABEL: test_getexp_ps_256
- ; CHECK: vgetexpps
+; CHECK-LABEL: test_getexp_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vgetexpps %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x42,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
@@ -2590,11 +2321,14 @@ declare <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float>, <8 x float>
declare <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_d_128
-; CHECK-NOT: call
-; CHECK: vpmaxsd %xmm
-; CHECK: {%k1}
define <4 x i32>@test_int_x86_avx512_mask_pmaxs_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3d,0xd1]
+; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x3d,0xc1]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2 ,i8 %mask)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask)
%res2 = add <4 x i32> %res, %res1
@@ -2603,11 +2337,14 @@ define <4 x i32>@test_int_x86_avx512_mask_pmaxs_d_128(<4 x i32> %x0, <4 x i32> %
declare <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_d_256
-; CHECK-NOT: call
-; CHECK: vpmaxsd %ymm
-; CHECK: {%k1}
define <8 x i32>@test_int_x86_avx512_mask_pmaxs_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3d,0xd1]
+; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x3d,0xc1]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -2616,11 +2353,14 @@ define <8 x i32>@test_int_x86_avx512_mask_pmaxs_d_256(<8 x i32> %x0, <8 x i32> %
declare <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_q_128
-; CHECK-NOT: call
-; CHECK: vpmaxsq %xmm
-; CHECK: {%k1}
define <2 x i64>@test_int_x86_avx512_mask_pmaxs_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxsq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x3d,0xd1]
+; CHECK-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x3d,0xc1]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -2629,11 +2369,14 @@ define <2 x i64>@test_int_x86_avx512_mask_pmaxs_q_128(<2 x i64> %x0, <2 x i64> %
declare <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_q_256
-; CHECK-NOT: call
-; CHECK: vpmaxsq %ymm
-; CHECK: {%k1}
define <4 x i64>@test_int_x86_avx512_mask_pmaxs_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxsq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x3d,0xd1]
+; CHECK-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x3d,0xc1]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask)
%res2 = add <4 x i64> %res, %res1
@@ -2642,11 +2385,14 @@ define <4 x i64>@test_int_x86_avx512_mask_pmaxs_q_256(<4 x i64> %x0, <4 x i64> %
declare <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_d_128
-; CHECK-NOT: call
-; CHECK: vpmaxud %xmm
-; CHECK: {%k1}
define <4 x i32>@test_int_x86_avx512_mask_pmaxu_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2,i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3f,0xd1]
+; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x3f,0xc1]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask)
%res2 = add <4 x i32> %res, %res1
@@ -2655,11 +2401,14 @@ define <4 x i32>@test_int_x86_avx512_mask_pmaxu_d_128(<4 x i32> %x0, <4 x i32> %
declare <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_d_256
-; CHECK-NOT: call
-; CHECK: vpmaxud %ymm
-; CHECK: {%k1}
define <8 x i32>@test_int_x86_avx512_mask_pmaxu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3f,0xd1]
+; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x3f,0xc1]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -2668,11 +2417,14 @@ define <8 x i32>@test_int_x86_avx512_mask_pmaxu_d_256(<8 x i32> %x0, <8 x i32> %
declare <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_q_128
-; CHECK-NOT: call
-; CHECK: vpmaxuq %xmm
-; CHECK: {%k1}
define <2 x i64>@test_int_x86_avx512_mask_pmaxu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxuq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x3f,0xd1]
+; CHECK-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x3f,0xc1]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -2681,11 +2433,14 @@ define <2 x i64>@test_int_x86_avx512_mask_pmaxu_q_128(<2 x i64> %x0, <2 x i64> %
declare <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_q_256
-; CHECK-NOT: call
-; CHECK: vpmaxuq %ymm
-; CHECK: {%k1}
define <4 x i64>@test_int_x86_avx512_mask_pmaxu_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmaxuq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x3f,0xd1]
+; CHECK-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x3f,0xc1]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask)
%res2 = add <4 x i64> %res, %res1
@@ -2694,11 +2449,14 @@ define <4 x i64>@test_int_x86_avx512_mask_pmaxu_q_256(<4 x i64> %x0, <4 x i64> %
declare <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_d_128
-; CHECK-NOT: call
-; CHECK: vpminsd %xmm
-; CHECK: {%k1}
define <4 x i32>@test_int_x86_avx512_mask_pmins_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmins_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x39,0xd1]
+; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x39,0xc1]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask)
%res2 = add <4 x i32> %res, %res1
@@ -2707,11 +2465,14 @@ define <4 x i32>@test_int_x86_avx512_mask_pmins_d_128(<4 x i32> %x0, <4 x i32> %
declare <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_d_256
-; CHECK-NOT: call
-; CHECK: vpminsd %ymm
-; CHECK: {%k1}
define <8 x i32>@test_int_x86_avx512_mask_pmins_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmins_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x39,0xd1]
+; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x39,0xc1]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -2720,11 +2481,14 @@ define <8 x i32>@test_int_x86_avx512_mask_pmins_d_256(<8 x i32> %x0, <8 x i32> %
declare <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_q_128
-; CHECK-NOT: call
-; CHECK: vpminsq %xmm
-; CHECK: {%k1}
define <2 x i64>@test_int_x86_avx512_mask_pmins_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmins_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminsq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x39,0xd1]
+; CHECK-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x39,0xc1]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -2733,11 +2497,14 @@ define <2 x i64>@test_int_x86_avx512_mask_pmins_q_128(<2 x i64> %x0, <2 x i64> %
declare <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_q_256
-; CHECK-NOT: call
-; CHECK: vpminsq %ymm
-; CHECK: {%k1}
define <4 x i64>@test_int_x86_avx512_mask_pmins_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pmins_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminsq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x39,0xd1]
+; CHECK-NEXT: vpminsq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x39,0xc1]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask)
%res2 = add <4 x i64> %res, %res1
@@ -2746,11 +2513,14 @@ define <4 x i64>@test_int_x86_avx512_mask_pmins_q_256(<4 x i64> %x0, <4 x i64> %
declare <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_d_128
-; CHECK-NOT: call
-; CHECK: vpminud %xmm
-; CHECK: {%k1}
define <4 x i32>@test_int_x86_avx512_mask_pminu_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pminu_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3b,0xd1]
+; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x3b,0xc1]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask)
%res2 = add <4 x i32> %res, %res1
@@ -2759,11 +2529,14 @@ define <4 x i32>@test_int_x86_avx512_mask_pminu_d_128(<4 x i32> %x0, <4 x i32> %
declare <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_d_256
-; CHECK-NOT: call
-; CHECK: vpminud %ymm
-; CHECK: {%k1}
define <8 x i32>@test_int_x86_avx512_mask_pminu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pminu_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3b,0xd1]
+; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x3b,0xc1]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -2772,11 +2545,14 @@ define <8 x i32>@test_int_x86_avx512_mask_pminu_d_256(<8 x i32> %x0, <8 x i32> %
declare <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_q_128
-; CHECK-NOT: call
-; CHECK: vpminuq %xmm
-; CHECK: {%k1}
define <2 x i64>@test_int_x86_avx512_mask_pminu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pminu_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminuq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x3b,0xd1]
+; CHECK-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x3b,0xc1]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -2785,11 +2561,14 @@ define <2 x i64>@test_int_x86_avx512_mask_pminu_q_128(<2 x i64> %x0, <2 x i64> %
declare <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_q_256
-; CHECK-NOT: call
-; CHECK: vpminuq %ymm
-; CHECK: {%k1}
define <4 x i64>@test_int_x86_avx512_mask_pminu_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pminu_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpminuq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x3b,0xd1]
+; CHECK-NEXT: vpminuq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x3b,0xc1]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask)
%res2 = add <4 x i64> %res, %res1
@@ -2798,12 +2577,15 @@ define <4 x i64>@test_int_x86_avx512_mask_pminu_q_256(<4 x i64> %x0, <4 x i64> %
declare <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermt2d %xmm{{.*}}{%k1}
-; CHECK-NOT: {z}
define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd9]
+; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7e,0xda]
+; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xca]
+; CHECK-NEXT: vpaddd %xmm1, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -2812,11 +2594,15 @@ define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i
declare <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermt2d %xmm{{.*}}{%k1} {z}
define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd9]
+; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7e,0xda]
+; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xca]
+; CHECK-NEXT: vpaddd %xmm1, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -2825,12 +2611,15 @@ define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x
declare <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermt2d %ymm{{.*}}{%k1}
-; CHECK-NOT: {z}
define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd9]
+; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7e,0xda]
+; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xca]
+; CHECK-NEXT: vpaddd %ymm1, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -2839,11 +2628,15 @@ define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i
declare <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermt2d {{.*}}{%k1} {z}
define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd9]
+; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7e,0xda]
+; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xca]
+; CHECK-NEXT: vpaddd %ymm1, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -2852,11 +2645,15 @@ define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x
declare <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermi2pd %xmm{{.*}}{%k1}
define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd9]
+; CHECK-NEXT: vpermi2pd %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x77,0xda]
+; CHECK-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0xfd,0x08,0x77,0xca]
+; CHECK-NEXT: vaddpd %xmm1, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -2865,11 +2662,15 @@ define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0,
declare <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermi2pd %ymm{{.*}}{%k1}
define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd9]
+; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x77,0xda]
+; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0xfd,0x28,0x77,0xca]
+; CHECK-NEXT: vaddpd %ymm1, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -2878,11 +2679,15 @@ define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0,
declare <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermi2ps %xmm{{.*}}{%k1}
define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd9]
+; CHECK-NEXT: vpermi2ps %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x77,0xda]
+; CHECK-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x77,0xca]
+; CHECK-NEXT: vaddps %xmm1, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -2891,11 +2696,15 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <
declare <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermi2ps %ymm{{.*}}{%k1}
define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd9]
+; CHECK-NEXT: vpermi2ps %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x77,0xda]
+; CHECK-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x77,0xca]
+; CHECK-NEXT: vaddps %ymm1, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -2904,11 +2713,14 @@ define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <
declare <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64>, <2 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsq{{.*}}{%k1}
define <2 x i64>@test_int_x86_avx512_mask_pabs_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpabsq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x1f,0xc8]
+; CHECK-NEXT: vpabsq %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x1f,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -2917,11 +2729,14 @@ define <2 x i64>@test_int_x86_avx512_mask_pabs_q_128(<2 x i64> %x0, <2 x i64> %x
declare <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64>, <4 x i64>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsq{{.*}}{%k1}
define <4 x i64>@test_int_x86_avx512_mask_pabs_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpabsq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x1f,0xc8]
+; CHECK-NEXT: vpabsq %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x1f,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -2930,11 +2745,14 @@ define <4 x i64>@test_int_x86_avx512_mask_pabs_q_256(<4 x i64> %x0, <4 x i64> %x
declare <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32>, <4 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsd{{.*}}{%k1}
define <4 x i32>@test_int_x86_avx512_mask_pabs_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpabsd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x1e,0xc8]
+; CHECK-NEXT: vpabsd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x1e,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -2943,25 +2761,30 @@ define <4 x i32>@test_int_x86_avx512_mask_pabs_d_128(<4 x i32> %x0, <4 x i32> %x
declare <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32>, <8 x i32>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsd{{.*}}{%k1}
define <8 x i32>@test_int_x86_avx512_mask_pabs_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpabsd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x1e,0xc8]
+; CHECK-NEXT: vpabsd %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x1e,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1)
%res2 = add <8 x i32> %res, %res1
ret <8 x i32> %res2
}
-
declare <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vscalefpd{{.*}}{%k1}
define <2 x double>@test_int_x86_avx512_mask_scalef_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vscalefpd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x2c,0xd1]
+; CHECK-NEXT: vscalefpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x2c,0xc1]
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -2970,11 +2793,14 @@ define <2 x double>@test_int_x86_avx512_mask_scalef_pd_128(<2 x double> %x0, <2
declare <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vscalefpd{{.*}}{%k1}
define <4 x double>@test_int_x86_avx512_mask_scalef_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vscalefpd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x2c,0xd1]
+; CHECK-NEXT: vscalefpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x2c,0xc1]
+; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -2982,11 +2808,15 @@ define <4 x double>@test_int_x86_avx512_mask_scalef_pd_256(<4 x double> %x0, <4
}
declare <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vscalefps{{.*}}{%k1}
+
define <4 x float>@test_int_x86_avx512_mask_scalef_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vscalefps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2c,0xd1]
+; CHECK-NEXT: vscalefps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x2c,0xc1]
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6c,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -2994,250 +2824,33 @@ define <4 x float>@test_int_x86_avx512_mask_scalef_ps_128(<4 x float> %x0, <4 x
}
declare <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vscalefps{{.*}}{%k1}
+
define <8 x float>@test_int_x86_avx512_mask_scalef_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vscalefps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2c,0xd1]
+; CHECK-NEXT: vscalefps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x2c,0xc1]
+; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
ret <8 x float> %res2
}
-declare <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
-
-define <2 x double>@test_int_x86_avx512_mask_unpckh_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_128:
-; CHECK: vunpckhpd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[1],k1[1]
-; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x15,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[1],xmm1[1]
- %res = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
- %res2 = fadd <2 x double> %res, %res1
- ret <2 x double> %res2
-}
-
-declare <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
-
-define <4 x double>@test_int_x86_avx512_mask_unpckh_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_256:
-; CHECK: vunpckhpd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[1],k1[1],ymm2[3],k1[3]
-; CHECK-NEXT: vunpckhpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x15,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
- %res = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
- %res1 = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
- %res2 = fadd <4 x double> %res, %res1
- ret <4 x double> %res2
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask_unpckh_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_128:
-; CHECK: vunpckhps %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[2],k1[2],xmm2[3],k1[3]
-; CHECK-NEXT: vunpckhps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x15,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
- %res = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
- %res2 = fadd <4 x float> %res, %res1
- ret <4 x float> %res2
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask_unpckh_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_256:
-; CHECK: ## BB#0:
-; CHECK: vunpckhps %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[2],k1[2],ymm2[3],k1[3],ymm2[6],k1[6],ymm2[7],k1[7]
-; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x15,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
- %res = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
- %res2 = fadd <8 x float> %res, %res1
- ret <8 x float> %res2
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
-
-define <2 x double>@test_int_x86_avx512_mask_unpckl_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_128:
-; CHECK: vunpcklpd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[0],k1[0]
-; CHECK-NEXT: vunpcklpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x14,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0]
- %res = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
- %res2 = fadd <2 x double> %res, %res1
- ret <2 x double> %res2
-}
-
-declare <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
-
-define <4 x double>@test_int_x86_avx512_mask_unpckl_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_256:
-; CHECK: vunpcklpd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[0],k1[0],ymm2[2],k1[2]
-; CHECK-NEXT: vunpcklpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x14,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
- %res = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
- %res1 = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
- %res2 = fadd <4 x double> %res, %res1
- ret <4 x double> %res2
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask_unpckl_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_128:
-; CHECK: vunpcklps %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[0],k1[0],xmm2[1],k1[1]
-; CHECK-NEXT: vunpcklps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x14,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
- %res = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
- %res2 = fadd <4 x float> %res, %res1
- ret <4 x float> %res2
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask_unpckl_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_256:
-; CHECK: vunpcklps %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[0],k1[0],ymm2[1],k1[1],ymm2[4],k1[4],ymm2[5],k1[5]
-; CHECK-NEXT: vunpcklps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x14,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
- %res = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
- %res2 = fadd <8 x float> %res, %res1
- ret <8 x float> %res2
-}
-
-declare <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-
-define <4 x i32>@test_int_x86_avx512_mask_punpckhd_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_128:
-; CHECK: vpunpckhdq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[2],k1[2],xmm2[3],k1[3]
-; CHECK-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6a,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
- %res = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
- %res1 = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
- %res2 = add <4 x i32> %res, %res1
- ret <4 x i32> %res2
-}
-
-declare <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
-
-define <4 x i32>@test_int_x86_avx512_mask_punpckld_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_128:
-; CHECK: vpunpckldq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[0],k1[0],xmm2[1],k1[1]
-; CHECK-NEXT: vpunpckldq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x62,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
- %res = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
- %res1 = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
- %res2 = add <4 x i32> %res, %res1
- ret <4 x i32> %res2
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-
-define <8 x i32>@test_int_x86_avx512_mask_punpckhd_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_256:
-; CHECK: ## BB#0:
-; CHECK: vpunpckhdq %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[2],k1[2],ymm2[3],k1[3],ymm2[6],k1[6],ymm2[7],k1[7]
-; CHECK-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x6a,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
- %res = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
- %res2 = add <8 x i32> %res, %res1
- ret <8 x i32> %res2
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
-
-define <8 x i32>@test_int_x86_avx512_mask_punpckld_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_256:
-; CHECK: vpunpckldq %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[0],k1[0],ymm2[1],k1[1],ymm2[4],k1[4],ymm2[5],k1[5]
-; CHECK-NEXT: vpunpckldq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x62,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
- %res = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
- %res2 = add <8 x i32> %res, %res1
- ret <8 x i32> %res2
-}
-
-declare <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-
-define <2 x i64>@test_int_x86_avx512_mask_punpckhqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_128:
-; CHECK: vpunpckhqdq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[1],k1[1]
-; CHECK-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x6d,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[1],xmm1[1]
- %res = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
- %res2 = add <2 x i64> %res, %res1
- ret <2 x i64> %res2
-}
-
-declare <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-
-define <2 x i64>@test_int_x86_avx512_mask_punpcklqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_128:
-; CHECK: vpunpcklqdq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[0],k1[0]
-; CHECK-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x6c,0xc1]
-; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[0]
- %res = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
- %res2 = add <2 x i64> %res, %res1
- ret <2 x i64> %res2
-}
-
-declare <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-
-define <4 x i64>@test_int_x86_avx512_mask_punpcklqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_256:
-; CHECK: vpunpcklqdq %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[0],k1[0],ymm2[2],k1[2]
-; CHECK-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x6c,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
- %res = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
- %res2 = add <4 x i64> %res, %res1
- ret <4 x i64> %res2
-}
-
-declare <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-
-define <4 x i64>@test_int_x86_avx512_mask_punpckhqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_256:
-; CHECK: vpunpckhqdq %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[1],k1[1],ymm2[3],k1[3]
-; CHECK-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x6d,0xc1]
-; CHECK-NEXT: ## ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
- %res = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
- %res2 = add <4 x i64> %res, %res1
- ret <4 x i64> %res2
-}
-
declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_128:
-; CHECK: vpmovqb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovqb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovqb %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovqb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x32,0xc1]
+; CHECK-NEXT: vpmovqb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x32,0xc2]
+; CHECK-NEXT: vpmovqb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3250,8 +2863,11 @@ declare void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_128:
-; CHECK: vpmovqb %xmm0, (%rdi)
-; CHECK: vpmovqb %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovqb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x32,0x07]
+; CHECK-NEXT: vpmovqb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x32,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
ret void
@@ -3261,9 +2877,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_128:
-; CHECK: vpmovsqb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsqb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsqb %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsqb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x22,0xc1]
+; CHECK-NEXT: vpmovsqb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x22,0xc2]
+; CHECK-NEXT: vpmovsqb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3276,8 +2897,11 @@ declare void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_128:
-; CHECK: vpmovsqb %xmm0, (%rdi)
-; CHECK: vpmovsqb %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsqb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x22,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsqb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x22,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
ret void
@@ -3287,9 +2911,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_128:
-; CHECK: vpmovusqb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusqb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusqb %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusqb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x12,0xc1]
+; CHECK-NEXT: vpmovusqb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x12,0xc2]
+; CHECK-NEXT: vpmovusqb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3302,8 +2931,11 @@ declare void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_128:
-; CHECK: vpmovusqb %xmm0, (%rdi)
-; CHECK: vpmovusqb %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusqb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x12,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusqb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x12,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
ret void
@@ -3313,9 +2945,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_256:
-; CHECK: vpmovqb %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovqb %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovqb %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovqb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x32,0xc1]
+; CHECK-NEXT: vpmovqb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x32,0xc2]
+; CHECK-NEXT: vpmovqb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3328,8 +2965,11 @@ declare void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_256:
-; CHECK: vpmovqb %ymm0, (%rdi)
-; CHECK: vpmovqb %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovqb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x32,0x07]
+; CHECK-NEXT: vpmovqb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x32,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
ret void
@@ -3339,9 +2979,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_256:
-; CHECK: vpmovsqb %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsqb %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsqb %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsqb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x22,0xc1]
+; CHECK-NEXT: vpmovsqb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x22,0xc2]
+; CHECK-NEXT: vpmovsqb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3354,8 +2999,11 @@ declare void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_256:
-; CHECK: vpmovsqb %ymm0, (%rdi)
-; CHECK: vpmovsqb %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsqb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x22,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsqb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x22,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
ret void
@@ -3365,9 +3013,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_256:
-; CHECK: vpmovusqb %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusqb %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusqb %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusqb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x12,0xc1]
+; CHECK-NEXT: vpmovusqb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x12,0xc2]
+; CHECK-NEXT: vpmovusqb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3380,8 +3033,11 @@ declare void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_256:
-; CHECK: vpmovusqb %ymm0, (%rdi)
-; CHECK: vpmovusqb %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusqb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x12,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusqb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x12,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
ret void
@@ -3391,9 +3047,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_128:
-; CHECK: vpmovqw %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovqw %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovqw %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovqw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x34,0xc1]
+; CHECK-NEXT: vpmovqw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x34,0xc2]
+; CHECK-NEXT: vpmovqw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x34,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3406,8 +3067,11 @@ declare void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_128:
-; CHECK: vpmovqw %xmm0, (%rdi)
-; CHECK: vpmovqw %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovqw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x34,0x07]
+; CHECK-NEXT: vpmovqw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x34,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
ret void
@@ -3417,9 +3081,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_128:
-; CHECK: vpmovsqw %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsqw %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsqw %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsqw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x24,0xc1]
+; CHECK-NEXT: vpmovsqw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x24,0xc2]
+; CHECK-NEXT: vpmovsqw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3432,8 +3101,11 @@ declare void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_128:
-; CHECK: vpmovsqw %xmm0, (%rdi)
-; CHECK: vpmovsqw %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsqw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x24,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsqw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x24,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
ret void
@@ -3443,9 +3115,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_128:
-; CHECK: vpmovusqw %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusqw %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusqw %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusqw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x14,0xc1]
+; CHECK-NEXT: vpmovusqw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x14,0xc2]
+; CHECK-NEXT: vpmovusqw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3458,8 +3135,11 @@ declare void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_128:
-; CHECK: vpmovusqw %xmm0, (%rdi)
-; CHECK: vpmovusqw %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusqw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x14,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusqw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x14,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
ret void
@@ -3469,9 +3149,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_256:
-; CHECK: vpmovqw %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovqw %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovqw %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovqw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x34,0xc1]
+; CHECK-NEXT: vpmovqw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x34,0xc2]
+; CHECK-NEXT: vpmovqw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3484,8 +3169,11 @@ declare void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_256:
-; CHECK: vpmovqw %ymm0, (%rdi)
-; CHECK: vpmovqw %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovqw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x34,0x07]
+; CHECK-NEXT: vpmovqw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x34,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
ret void
@@ -3495,9 +3183,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_256:
-; CHECK: vpmovsqw %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsqw %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsqw %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsqw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x24,0xc1]
+; CHECK-NEXT: vpmovsqw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x24,0xc2]
+; CHECK-NEXT: vpmovsqw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3510,8 +3203,11 @@ declare void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_256:
-; CHECK: vpmovsqw %ymm0, (%rdi)
-; CHECK: vpmovsqw %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsqw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x24,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsqw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x24,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
ret void
@@ -3521,9 +3217,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_256:
-; CHECK: vpmovusqw %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusqw %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusqw %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusqw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x14,0xc1]
+; CHECK-NEXT: vpmovusqw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x14,0xc2]
+; CHECK-NEXT: vpmovusqw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3536,8 +3237,11 @@ declare void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_256:
-; CHECK: vpmovusqw %ymm0, (%rdi)
-; CHECK: vpmovusqw %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusqw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x14,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusqw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x14,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
ret void
@@ -3547,9 +3251,14 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_128:
-; CHECK: vpmovqd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovqd %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovqd %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovqd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x35,0xc1]
+; CHECK-NEXT: vpmovqd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x35,0xc2]
+; CHECK-NEXT: vpmovqd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x35,0xc0]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
@@ -3562,8 +3271,11 @@ declare void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_128:
-; CHECK: vpmovqd %xmm0, (%rdi)
-; CHECK: vpmovqd %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovqd %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x35,0x07]
+; CHECK-NEXT: vpmovqd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x35,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
ret void
@@ -3573,9 +3285,14 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_128:
-; CHECK: vpmovsqd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsqd %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsqd %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsqd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x25,0xc1]
+; CHECK-NEXT: vpmovsqd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x25,0xc2]
+; CHECK-NEXT: vpmovsqd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x25,0xc0]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
@@ -3588,8 +3305,11 @@ declare void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_128:
-; CHECK: vpmovsqd %xmm0, (%rdi)
-; CHECK: vpmovsqd %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsqd %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x25,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsqd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x25,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
ret void
@@ -3599,9 +3319,14 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_128:
-; CHECK: vpmovusqd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusqd %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusqd %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusqd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x15,0xc1]
+; CHECK-NEXT: vpmovusqd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x15,0xc2]
+; CHECK-NEXT: vpmovusqd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x15,0xc0]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
@@ -3614,8 +3339,11 @@ declare void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_128:
-; CHECK: vpmovusqd %xmm0, (%rdi)
-; CHECK: vpmovusqd %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusqd %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x15,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusqd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x15,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
ret void
@@ -3625,9 +3353,14 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_256:
-; CHECK: vpmovqd %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovqd %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovqd %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovqd %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x35,0xc1]
+; CHECK-NEXT: vpmovqd %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc2]
+; CHECK-NEXT: vpmovqd %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc0]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
@@ -3640,8 +3373,11 @@ declare void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_256:
-; CHECK: vpmovqd %ymm0, (%rdi)
-; CHECK: vpmovqd %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovqd %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x35,0x07]
+; CHECK-NEXT: vpmovqd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x35,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
ret void
@@ -3651,9 +3387,14 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_256:
-; CHECK: vpmovsqd %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsqd %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsqd %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsqd %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x25,0xc1]
+; CHECK-NEXT: vpmovsqd %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x25,0xc2]
+; CHECK-NEXT: vpmovsqd %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x25,0xc0]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
@@ -3666,8 +3407,11 @@ declare void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_256:
-; CHECK: vpmovsqd %ymm0, (%rdi)
-; CHECK: vpmovsqd %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsqd %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x25,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsqd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x25,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
ret void
@@ -3677,9 +3421,14 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_256:
-; CHECK: vpmovusqd %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusqd %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusqd %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusqd %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x15,0xc1]
+; CHECK-NEXT: vpmovusqd %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x15,0xc2]
+; CHECK-NEXT: vpmovusqd %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x15,0xc0]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
@@ -3692,8 +3441,11 @@ declare void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_256:
-; CHECK: vpmovusqd %ymm0, (%rdi)
-; CHECK: vpmovusqd %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusqd %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x15,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusqd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x15,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
ret void
@@ -3703,9 +3455,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_128:
-; CHECK: vpmovdb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovdb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovdb %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovdb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x31,0xc1]
+; CHECK-NEXT: vpmovdb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x31,0xc2]
+; CHECK-NEXT: vpmovdb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3718,8 +3475,11 @@ declare void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32>, i8)
define void @test_int_x86_avx512_mask_pmov_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_128:
-; CHECK: vpmovdb %xmm0, (%rdi)
-; CHECK: vpmovdb %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovdb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x31,0x07]
+; CHECK-NEXT: vpmovdb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x31,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2)
ret void
@@ -3729,9 +3489,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_128:
-; CHECK: vpmovsdb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsdb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsdb %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsdb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x21,0xc1]
+; CHECK-NEXT: vpmovsdb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x21,0xc2]
+; CHECK-NEXT: vpmovsdb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3744,8 +3509,11 @@ declare void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovs_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_128:
-; CHECK: vpmovsdb %xmm0, (%rdi)
-; CHECK: vpmovsdb %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsdb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x21,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsdb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x21,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2)
ret void
@@ -3755,9 +3523,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_128:
-; CHECK: vpmovusdb %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusdb %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusdb %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusdb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x11,0xc1]
+; CHECK-NEXT: vpmovusdb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x11,0xc2]
+; CHECK-NEXT: vpmovusdb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3770,8 +3543,11 @@ declare void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovus_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_128:
-; CHECK: vpmovusdb %xmm0, (%rdi)
-; CHECK: vpmovusdb %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusdb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x11,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusdb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2)
ret void
@@ -3781,9 +3557,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_256:
-; CHECK: vpmovdb %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovdb %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovdb %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovdb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x31,0xc1]
+; CHECK-NEXT: vpmovdb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x31,0xc2]
+; CHECK-NEXT: vpmovdb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3796,8 +3577,11 @@ declare void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32>, i8)
define void @test_int_x86_avx512_mask_pmov_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_256:
-; CHECK: vpmovdb %ymm0, (%rdi)
-; CHECK: vpmovdb %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovdb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x31,0x07]
+; CHECK-NEXT: vpmovdb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x31,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2)
ret void
@@ -3807,9 +3591,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_256:
-; CHECK: vpmovsdb %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsdb %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsdb %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsdb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x21,0xc1]
+; CHECK-NEXT: vpmovsdb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x21,0xc2]
+; CHECK-NEXT: vpmovsdb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3822,8 +3611,11 @@ declare void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovs_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_256:
-; CHECK: vpmovsdb %ymm0, (%rdi)
-; CHECK: vpmovsdb %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsdb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x21,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsdb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x21,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2)
ret void
@@ -3833,9 +3625,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_256:
-; CHECK: vpmovusdb %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusdb %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusdb %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusdb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x11,0xc1]
+; CHECK-NEXT: vpmovusdb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x11,0xc2]
+; CHECK-NEXT: vpmovusdb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc0]
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2)
%res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
@@ -3848,8 +3645,11 @@ declare void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovus_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_256:
-; CHECK: vpmovusdb %ymm0, (%rdi)
-; CHECK: vpmovusdb %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusdb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x11,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusdb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2)
ret void
@@ -3859,9 +3659,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_128:
-; CHECK: vpmovdw %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovdw %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovdw %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovdw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x33,0xc1]
+; CHECK-NEXT: vpmovdw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x33,0xc2]
+; CHECK-NEXT: vpmovdw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x33,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3874,8 +3679,11 @@ declare void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32>, i8)
define void @test_int_x86_avx512_mask_pmov_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_128:
-; CHECK: vpmovdw %xmm0, (%rdi)
-; CHECK: vpmovdw %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovdw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x33,0x07]
+; CHECK-NEXT: vpmovdw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x33,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2)
ret void
@@ -3885,9 +3693,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_128:
-; CHECK: vpmovsdw %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsdw %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsdw %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsdw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x23,0xc1]
+; CHECK-NEXT: vpmovsdw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x23,0xc2]
+; CHECK-NEXT: vpmovsdw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3900,8 +3713,11 @@ declare void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovs_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_128:
-; CHECK: vpmovsdw %xmm0, (%rdi)
-; CHECK: vpmovsdw %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsdw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x23,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsdw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x23,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2)
ret void
@@ -3911,9 +3727,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_128:
-; CHECK: vpmovusdw %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusdw %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusdw %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusdw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x13,0xc1]
+; CHECK-NEXT: vpmovusdw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x13,0xc2]
+; CHECK-NEXT: vpmovusdw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3926,8 +3747,11 @@ declare void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovus_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_128:
-; CHECK: vpmovusdw %xmm0, (%rdi)
-; CHECK: vpmovusdw %xmm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusdw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x13,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusdw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x13,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2)
ret void
@@ -3937,9 +3761,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_256:
-; CHECK: vpmovdw %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovdw %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovdw %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovdw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x33,0xc1]
+; CHECK-NEXT: vpmovdw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x33,0xc2]
+; CHECK-NEXT: vpmovdw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3952,8 +3781,11 @@ declare void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32>, i8)
define void @test_int_x86_avx512_mask_pmov_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_256:
-; CHECK: vpmovdw %ymm0, (%rdi)
-; CHECK: vpmovdw %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovdw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x33,0x07]
+; CHECK-NEXT: vpmovdw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x33,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2)
ret void
@@ -3963,9 +3795,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_256:
-; CHECK: vpmovsdw %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsdw %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsdw %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsdw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x23,0xc1]
+; CHECK-NEXT: vpmovsdw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x23,0xc2]
+; CHECK-NEXT: vpmovsdw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -3978,8 +3815,11 @@ declare void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovs_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_256:
-; CHECK: vpmovsdw %ymm0, (%rdi)
-; CHECK: vpmovsdw %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsdw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x23,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovsdw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x23,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2)
ret void
@@ -3989,9 +3829,14 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_256:
-; CHECK: vpmovusdw %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovusdw %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovusdw %ymm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovusdw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x13,0xc1]
+; CHECK-NEXT: vpmovusdw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x13,0xc2]
+; CHECK-NEXT: vpmovusdw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc0]
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2)
%res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -4004,8 +3849,11 @@ declare void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovus_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_256:
-; CHECK: vpmovusdw %ymm0, (%rdi)
-; CHECK: vpmovusdw %ymm0, (%rdi) {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovusdw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x13,0x07]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpmovusdw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x13,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1)
call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2)
ret void
@@ -4016,12 +3864,11 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32>, <2 x double>,
define <2 x double>@test_int_x86_avx512_mask_cvt_dq2pd_128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0xe6,0xc8]
+; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0xe6,0xc0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 %x2)
%res1 = call <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -4033,12 +3880,11 @@ declare <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32>, <4 x double>,
define <4 x double>@test_int_x86_avx512_mask_cvt_dq2pd_256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0xe6,0xc8]
+; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0 ## encoding: [0xc5,0xfe,0xe6,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 %x2)
%res1 = call <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -4050,12 +3896,11 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32>, <4 x float>, i
define <4 x float>@test_int_x86_avx512_mask_cvt_dq2ps_128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5b,0xc8]
+; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x5b,0xc0]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 %x2)
%res1 = call <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -4067,12 +3912,11 @@ declare <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32>, <8 x float>, i
define <8 x float>@test_int_x86_avx512_mask_cvt_dq2ps_256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5b,0xc8]
+; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x5b,0xc0]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 %x2)
%res1 = call <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -4084,12 +3928,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8
define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0xe6,0xc8]
+; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0xe6,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4101,12 +3944,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double>, <4 x i32>, i8
define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0xe6,0xc8]
+; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x28,0xe6,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4118,12 +3960,11 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double>, <4 x float>
define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps_256(<4 x double> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x5a,0xc8]
+; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x5a,0xc0]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double> %x0, <4 x float> %x1, i8 %x2)
%res1 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double> %x0, <4 x float> %x1, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -4135,12 +3976,11 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8
define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x5a,0xc8]
+; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x5a,0xc0]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2)
%res1 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -4152,12 +3992,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double>, <4 x i32>, i
define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x79,0xc8]
+; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x08,0x79,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4169,12 +4008,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double>, <4 x i32>, i
define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtpd2udq %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtpd2udq %ymm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtpd2udq %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x29,0x79,0xc8]
+; CHECK-NEXT: vcvtpd2udq %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x28,0x79,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4186,12 +4024,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtps2dq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtps2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x5b,0xc8]
+; CHECK-NEXT: vcvtps2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x5b,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4203,12 +4040,11 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtps2dq %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtps2dq %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtps2dq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x5b,0xc8]
+; CHECK-NEXT: vcvtps2dq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x5b,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -4220,12 +4056,11 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float>, <2 x double
define <2 x double>@test_int_x86_avx512_mask_cvt_ps2pd_128(<4 x float> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtps2pd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtps2pd %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtps2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5a,0xc8]
+; CHECK-NEXT: vcvtps2pd %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x5a,0xc0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float> %x0, <2 x double> %x1, i8 %x2)
%res1 = call <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float> %x0, <2 x double> %x1, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -4237,12 +4072,11 @@ declare <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float>, <4 x double
define <4 x double>@test_int_x86_avx512_mask_cvt_ps2pd_256(<4 x float> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtps2pd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtps2pd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5a,0xc8]
+; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x5a,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float> %x0, <4 x double> %x1, i8 %x2)
%res1 = call <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float> %x0, <4 x double> %x1, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -4254,12 +4088,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float>, <4 x i32>, i8
define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtps2udq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtps2udq %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtps2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x79,0xc8]
+; CHECK-NEXT: vcvtps2udq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x79,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4271,12 +4104,11 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float>, <8 x i32>, i8
define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtps2udq %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtps2udq %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtps2udq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x79,0xc8]
+; CHECK-NEXT: vcvtps2udq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x79,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -4288,12 +4120,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double>, <4 x i32>, i
define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xe6,0xc8]
+; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe6,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4305,12 +4136,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double>, <4 x i32>, i
define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xe6,0xc8]
+; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xe6,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4322,12 +4152,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double>, <4 x i32>,
define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x78,0xc8]
+; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x08,0x78,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4339,12 +4168,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double>, <4 x i32>,
define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x29,0x78,0xc8]
+; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x28,0x78,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4356,12 +4184,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float>, <4 x i32>, i8
define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvttps2dq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvttps2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x5b,0xc8]
+; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x5b,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4373,12 +4200,11 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float>, <8 x i32>, i8
define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvttps2dq %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvttps2dq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x5b,0xc8]
+; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7e,0x28,0x5b,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -4390,12 +4216,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float>, <4 x i32>, i
define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvttps2udq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvttps2udq %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvttps2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x78,0xc8]
+; CHECK-NEXT: vcvttps2udq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x78,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -4407,12 +4232,11 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float>, <8 x i32>, i
define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvttps2udq %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvttps2udq %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvttps2udq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x78,0xc8]
+; CHECK-NEXT: vcvttps2udq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x78,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -4424,12 +4248,11 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32>, <2 x double>
define <2 x double>@test_int_x86_avx512_mask_cvt_udq2pd_128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x7a,0xc8]
+; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x7a,0xc0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 %x2)
%res1 = call <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -4441,12 +4264,11 @@ declare <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32>, <4 x double>
define <4 x double>@test_int_x86_avx512_mask_cvt_udq2pd_256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x7a,0xc8]
+; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7e,0x28,0x7a,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 %x2)
%res1 = call <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -4458,12 +4280,11 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask_cvt_udq2ps_128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x7a,0xc8]
+; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7f,0x08,0x7a,0xc0]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 %x2)
%res1 = call <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -4475,12 +4296,11 @@ declare <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32>, <8 x float>,
define <8 x float>@test_int_x86_avx512_mask_cvt_udq2ps_256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x7a,0xc8]
+; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7f,0x28,0x7a,0xc0]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 %x2)
%res1 = call <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -4488,12 +4308,15 @@ define <8 x float>@test_int_x86_avx512_mask_cvt_udq2ps_256(<8 x i32> %x0, <8 x f
}
declare <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double>, i32, <2 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_pd_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrndscalepd {{.*}}{%k1}
-; CHECK: vrndscalepd
+
define <2 x double>@test_int_x86_avx512_mask_rndscale_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrndscalepd $4, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x09,0xc8,0x04]
+; CHECK-NEXT: vrndscalepd $88, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x09,0xc0,0x58]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %x0, i32 4, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %x0, i32 88, <2 x double> %x2, i8 -1)
%res2 = fadd <2 x double> %res, %res1
@@ -4501,12 +4324,15 @@ define <2 x double>@test_int_x86_avx512_mask_rndscale_pd_128(<2 x double> %x0, <
}
declare <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double>, i32, <4 x double>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_pd_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrndscalepd {{.*}}{%k1}
-; CHECK: vrndscalepd
+
define <4 x double>@test_int_x86_avx512_mask_rndscale_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrndscalepd $4, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x09,0xc8,0x04]
+; CHECK-NEXT: vrndscalepd $88, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x09,0xc0,0x58]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %x0, i32 4, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %x0, i32 88, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -4514,12 +4340,15 @@ define <4 x double>@test_int_x86_avx512_mask_rndscale_pd_256(<4 x double> %x0, <
}
declare <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float>, i32, <4 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_ps_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrndscaleps {{.*}}{%k1}
-; CHECK: vrndscaleps
+
define <4 x float>@test_int_x86_avx512_mask_rndscale_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrndscaleps $88, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x08,0xc8,0x58]
+; CHECK-NEXT: vrndscaleps $4, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x08,0xc0,0x04]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %x0, i32 88, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %x0, i32 4, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -4528,12 +4357,14 @@ define <4 x float>@test_int_x86_avx512_mask_rndscale_ps_128(<4 x float> %x0, <4
declare <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float>, i32, <8 x float>, i8)
-; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_ps_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrndscaleps {{.*}}{%k1}
-; CHECK: vrndscaleps
define <8 x float>@test_int_x86_avx512_mask_rndscale_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrndscaleps $5, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x08,0xc8,0x05]
+; CHECK-NEXT: vrndscaleps $66, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x08,0xc0,0x42]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %x0, i32 5, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %x0, i32 66, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -4545,17 +4376,16 @@ declare <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float>, <8 x float
define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: ## ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x23,0xd1,0x16]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x23,0xd9,0x16]
+; CHECK-NEXT: ## ymm3 {%k1} {z} = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x23,0xc1,0x16]
; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc0]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4)
%res1 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1)
%res2 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> zeroinitializer, i8 %x4)
@@ -4569,17 +4399,16 @@ declare <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double>, <4 x dou
define <4 x double>@test_int_x86_avx512_mask_shuf_f64x2_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: ## ymm3 = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x23,0xd1,0x16]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3]
+; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x23,0xd9,0x16]
+; CHECK-NEXT: ## ymm3 {%k1} {z} = ymm0[0,1],ymm1[2,3]
+; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x23,0xc1,0x16]
; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 -1)
%res2 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> zeroinitializer, i8 %x4)
@@ -4593,14 +4422,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32>, <8 x i32>, i32
define <8 x i32>@test_int_x86_avx512_mask_shuf_i32x4_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x43,0xd1,0x16]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x43,0xc1,0x16]
; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -4612,14 +4440,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64>, <4 x i64>, i32
define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x43,0xd1,0x16]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3]
+; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x43,0xc1,0x16]
; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -4631,13 +4458,13 @@ declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float>, i32, <4
define <4 x float>@test_int_x86_avx512_mask_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x4_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0
-; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01]
+; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x19,0xc2,0x01]
+; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x19,0xc0,0x01]
+; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xca]
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 %x3)
%res2 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 -1)
@@ -4651,14 +4478,13 @@ declare <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double>, i32, <2
define <2 x double>@test_int_x86_avx512_mask_getmant_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x26,0xc8,0x0b]
+; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0x89,0x26,0xd0,0x0b]
+; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x26,0xc0,0x0b]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 11, <2 x double> %x2, i8 %x3)
%res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 11, <2 x double> zeroinitializer, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 11, <2 x double> %x2, i8 -1)
@@ -4672,12 +4498,11 @@ declare <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double>, i32, <4
define <4 x double>@test_int_x86_avx512_mask_getmant_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vgetmantpd $11, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vgetmantpd $11, %ymm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vgetmantpd $11, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x26,0xc8,0x0b]
+; CHECK-NEXT: vgetmantpd $11, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x26,0xc0,0x0b]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %x0, i32 11, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %x0, i32 11, <4 x double> %x2, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -4689,12 +4514,11 @@ declare <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float>, i32, <4 x
define <4 x float>@test_int_x86_avx512_mask_getmant_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vgetmantps $11, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vgetmantps $11, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vgetmantps $11, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x26,0xc8,0x0b]
+; CHECK-NEXT: vgetmantps $11, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x26,0xc0,0x0b]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> %x0, i32 11, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> %x0, i32 11, <4 x float> %x2, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -4706,12 +4530,11 @@ declare <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float>, i32, <8 x
define <8 x float>@test_int_x86_avx512_mask_getmant_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vgetmantps $11, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vgetmantps $11, %ymm0, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vgetmantps $11, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x26,0xc8,0x0b]
+; CHECK-NEXT: vgetmantps $11, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x26,0xc0,0x0b]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> %x0, i32 11, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> %x0, i32 11, <8 x float> %x2, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -4723,17 +4546,16 @@ declare <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double>, <2 x double
define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshufpd $22, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[0],k1[1]
-; CHECK-NEXT: vshufpd $22, %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: ## xmm3 = k1[0],xmm0[1]
-; CHECK-NEXT: vshufpd $22, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshufpd $22, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x16]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[0],xmm1[1]
+; CHECK-NEXT: vshufpd $22, %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xd9,0x16]
+; CHECK-NEXT: ## xmm3 {%k1} {z} = xmm0[0],xmm1[1]
+; CHECK-NEXT: vshufpd $22, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xc6,0xc1,0x16]
; CHECK-NEXT: ## xmm0 = xmm0[0],xmm1[1]
-; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0x58,0xc0]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 22, <2 x double> %x3, i8 %x4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 22, <2 x double> %x3, i8 -1)
%res2 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 22, <2 x double> zeroinitializer, i8 %x4)
@@ -4747,14 +4569,13 @@ declare <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double>, <4 x double
define <4 x double>@test_int_x86_avx512_mask_shuf_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshufpd $22, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[0],k1[1],ymm2[3],k1[2]
-; CHECK-NEXT: vshufpd $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshufpd $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xc6,0xd1,0x16]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
+; CHECK-NEXT: vshufpd $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xc6,0xc1,0x16]
; CHECK-NEXT: ## ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
-; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 -1)
%res2 = fadd <4 x double> %res, %res1
@@ -4766,14 +4587,13 @@ declare <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask_shuf_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshufps $22, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: ## xmm2 = xmm2[2,1],k1[1,0]
-; CHECK-NEXT: vshufps $22, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshufps $22, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0xc6,0xd1,0x16]
+; CHECK-NEXT: ## xmm2 {%k1} = xmm0[2,1],xmm1[1,0]
+; CHECK-NEXT: vshufps $22, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0xc6,0xc1,0x16]
; CHECK-NEXT: ## xmm0 = xmm0[2,1],xmm1[1,0]
-; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6c,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float> %x0, <4 x float> %x1, i32 22, <4 x float> %x3, i8 %x4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float> %x0, <4 x float> %x1, i32 22, <4 x float> %x3, i8 -1)
%res2 = fadd <4 x float> %res, %res1
@@ -4785,14 +4605,13 @@ declare <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float>, <8 x float>,
define <8 x float>@test_int_x86_avx512_mask_shuf_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vshufps $22, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: ## ymm2 = ymm2[2,1],k1[1,0],ymm2[6,5],k1[5,4]
-; CHECK-NEXT: vshufps $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshufps $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0xc6,0xd1,0x16]
+; CHECK-NEXT: ## ymm2 {%k1} = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4]
+; CHECK-NEXT: vshufps $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0xc6,0xc1,0x16]
; CHECK-NEXT: ## ymm0 = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4]
-; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4)
%res1 = call <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1)
%res2 = fadd <8 x float> %res, %res1
@@ -4804,14 +4623,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32>, <4 x i32>, i32,
define <4 x i32>@test_int_x86_avx512_mask_valign_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: valignd $22, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: valignd $22, %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: valignd $22, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: valignd $22, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x16]
+; CHECK-NEXT: valignd $22, %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x03,0xd9,0x16]
+; CHECK-NEXT: valignd $22, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x03,0xc1,0x16]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 %x4)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 -1)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> zeroinitializer,i8 %x4)
@@ -4825,12 +4643,11 @@ declare <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32>, <8 x i32>, i32,
define <8 x i32>@test_int_x86_avx512_mask_valign_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: valignd $22, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: valignd $22, %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: valignd $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x03,0xd1,0x16]
+; CHECK-NEXT: valignd $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x03,0xc1,0x16]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -4842,12 +4659,11 @@ declare <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64>, <2 x i64>, i32,
define <2 x i64>@test_int_x86_avx512_mask_valign_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: valignq $22, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: valignq $22, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: valignq $22, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x03,0xd1,0x16]
+; CHECK-NEXT: valignq $22, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x03,0xc1,0x16]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 %x4)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -4859,127 +4675,29 @@ declare <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64>, <4 x i64>, i32,
define <4 x i64>@test_int_x86_avx512_mask_valign_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: valignq $22, %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: valignq $22, %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: valignq $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x03,0xd1,0x16]
+; CHECK-NEXT: valignq $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x03,0xc1,0x16]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1)
%res2 = add <4 x i64> %res, %res1
ret <4 x i64> %res2
}
-declare <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double>, i32, <4 x double>, i8)
-
-define <4 x double>@test_int_x86_avx512_mask_vpermil_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermilpd $22, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: ## ymm1 = ymm1[0,1,3,2]
-; CHECK-NEXT: vpermilpd $22, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: ## ymm2 = k1[0,1,3,2]
-; CHECK-NEXT: vpermilpd $22, %ymm0, %ymm0
-; CHECK-NEXT: ## ymm0 = ymm0[0,1,3,2]
-; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 %x3)
- %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> zeroinitializer, i8 %x3)
- %res2 = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 -1)
- %res3 = fadd <4 x double> %res, %res1
- %res4 = fadd <4 x double> %res2, %res3
- ret <4 x double> %res4
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double>, i32, <2 x double>, i8)
-
-define <2 x double>@test_int_x86_avx512_mask_vpermil_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: ## xmm1 = xmm1[1,0]
-; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: ## xmm2 = k1[1,0]
-; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm0
-; CHECK-NEXT: ## xmm0 = xmm0[1,0]
-; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 %x3)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> zeroinitializer, i8 %x3)
- %res2 = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 -1)
- %res3 = fadd <2 x double> %res, %res1
- %res4 = fadd <2 x double> %res3, %res2
- ret <2 x double> %res4
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float>, i32, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask_vpermil_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermilps $22, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: ## ymm1 = ymm1[2,1,1,0,6,5,5,4]
-; CHECK-NEXT: vpermilps $22, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: ## ymm2 = k1[2,1,1,0,6,5,5,4]
-; CHECK-NEXT: vpermilps $22, %ymm0, %ymm0
-; CHECK-NEXT: ## ymm0 = ymm0[2,1,1,0,6,5,5,4]
-; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 %x3)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> zeroinitializer, i8 %x3)
- %res2 = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 -1)
- %res3 = fadd <8 x float> %res, %res1
- %res4 = fadd <8 x float> %res3, %res2
- ret <8 x float> %res4
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float>, i32, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask_vpermil_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermilps $22, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: ## xmm1 = xmm1[2,1,1,0]
-; CHECK-NEXT: vpermilps $22, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: ## xmm2 = k1[2,1,1,0]
-; CHECK-NEXT: vpermilps $22, %xmm0, %xmm0
-; CHECK-NEXT: ## xmm0 = xmm0[2,1,1,0]
-; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 %x3)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> zeroinitializer, i8 %x3)
- %res2 = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 -1)
- %res3 = fadd <4 x float> %res, %res1
- %res4 = fadd <4 x float> %res2, %res3
- ret <4 x float> %res4
-}
-
declare <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8)
define <4 x double>@test_int_x86_avx512_mask_vpermilvar_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x0d,0xd1]
+; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x0d,0xd9]
+; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x0d,0xc1]
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xcb]
+; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> zeroinitializer, i8 %x3)
%res2 = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1)
@@ -4993,14 +4711,13 @@ declare <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double>, <2 x
define <2 x double>@test_int_x86_avx512_mask_vpermilvar_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x0d,0xd1]
+; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x0d,0xd9]
+; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x0d,0xc1]
+; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0xed,0x08,0x58,0xcb]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> zeroinitializer, i8 %x3)
%res2 = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1)
@@ -5014,14 +4731,13 @@ declare <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float>, <8 x i3
define <8 x float>@test_int_x86_avx512_mask_vpermilvar_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x0c,0xd1]
+; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x0c,0xd9]
+; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x0c,0xc1]
+; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xcb]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> zeroinitializer, i8 %x3)
%res2 = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
@@ -5035,14 +4751,13 @@ declare <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float>, <4 x i3
define <4 x float>@test_int_x86_avx512_mask_vpermilvar_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x0c,0xd1]
+; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x0c,0xd9]
+; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x0c,0xc1]
+; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6c,0x08,0x58,0xcb]
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> zeroinitializer, i8 %x3)
%res2 = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1)
@@ -5056,14 +4771,13 @@ declare <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float>, <4 x floa
define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01]
+; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xd9,0x01]
+; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x18,0xc1,0x01]
+; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc0]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4)
%res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1)
%res2 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> zeroinitializer, i8 %x4)
@@ -5077,14 +4791,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32>, <4 x i32>, i3
define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01]
+; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xd9,0x01]
+; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x38,0xc1,0x01]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 -1)
@@ -5099,13 +4812,12 @@ declare <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32>, <4 x i32>, <4
define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0x75,0x09,0x25,0xda,0x21]
+; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0x75,0x08,0x25,0xc2,0x21]
+; CHECK-NEXT: vpaddd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 %x4)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -5117,13 +4829,12 @@ declare <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32>, <4 x i32>, <4
define <4 x i32>@test_int_x86_avx512_maskz_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0x89,0x25,0xda,0x21]
+; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0x75,0x08,0x25,0xc2,0x21]
+; CHECK-NEXT: vpaddd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x65,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 %x4)
%res1 = call <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 -1)
%res2 = add <4 x i32> %res, %res1
@@ -5135,13 +4846,12 @@ declare <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32>, <8 x i32>, <8
define <8 x i32>@test_int_x86_avx512_mask_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x25,0xda,0x21]
+; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x25,0xc2,0x21]
+; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 %x4)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -5153,13 +4863,12 @@ declare <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32>, <8 x i32>, <8
define <8 x i32>@test_int_x86_avx512_maskz_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x25,0xda,0x21]
+; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x25,0xc2,0x21]
+; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 %x4)
%res1 = call <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 -1)
%res2 = add <8 x i32> %res, %res1
@@ -5171,13 +4880,12 @@ declare <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64>, <2 x i64>, <2
define <2 x i64>@test_int_x86_avx512_mask_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x25,0xda,0x21]
+; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x25,0xc2,0x21]
+; CHECK-NEXT: vpaddq %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 %x4)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -5189,13 +4897,12 @@ declare <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64>, <2 x i64>, <2
define <2 x i64>@test_int_x86_avx512_maskz_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm3, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x25,0xda,0x21]
+; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x25,0xc2,0x21]
+; CHECK-NEXT: vpaddq %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 %x4)
%res1 = call <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 -1)
%res2 = add <2 x i64> %res, %res1
@@ -5207,13 +4914,12 @@ declare <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64>, <4 x i64>, <4
define <4 x i64>@test_int_x86_avx512_mask_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x25,0xda,0x21]
+; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x25,0xc2,0x21]
+; CHECK-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 %x4)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 -1)
%res2 = add <4 x i64> %res, %res1
@@ -5225,121 +4931,45 @@ declare <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64>, <4 x i64>, <4
define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
-; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x25,0xda,0x21]
+; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x25,0xc2,0x21]
+; CHECK-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0xe5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 %x4)
%res1 = call <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 -1)
%res2 = add <4 x i64> %res, %res1
ret <4 x i64> %res2
}
-declare <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32>, <8 x i32>, i8)
-
-define <8 x i32>@test_int_x86_avx512_pbroadcastd_256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpbroadcastd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpbroadcastd %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> %x1, i8 -1)
- %res1 = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask)
- %res2 = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask)
- %res3 = add <8 x i32> %res, %res1
- %res4 = add <8 x i32> %res2, %res3
- ret <8 x i32> %res4
-}
-
-declare <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32>, <4 x i32>, i8)
-
-define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
- %res1 = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask)
- %res2 = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %mask)
- %res3 = add <4 x i32> %res, %res1
- %res4 = add <4 x i32> %res2, %res3
- ret <4 x i32> %res4
-}
-
-declare <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64>, <4 x i64>, i8)
-
-define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1)
- %res1 = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 %mask)
- %res2 = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> zeroinitializer,i8 %mask)
- %res3 = add <4 x i64> %res, %res1
- %res4 = add <4 x i64> %res2, %res3
- ret <4 x i64> %res4
-}
-
-declare <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64>, <2 x i64>, i8)
-
-define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1)
- %res1 = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 %mask)
- %res2 = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> zeroinitializer,i8 %mask)
- %res3 = add <2 x i64> %res, %res1
- %res4 = add <2 x i64> %res2, %res3
- ret <2 x i64> %res4
-}
-
define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) {
- ; CHECK: test_x86_vcvtph2ps_128
- ; CHECK: vcvtph2ps %xmm0, %xmm0
+; CHECK-LABEL: test_x86_vcvtph2ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x13,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_x86_vcvtph2ps_128_rrk(<8 x i16> %a0,<4 x float> %a1, i8 %mask) {
- ; CHECK: test_x86_vcvtph2ps_128_rrk
- ; CHECK: vcvtph2ps %xmm0, %xmm1 {%k1}
+; CHECK-LABEL: test_x86_vcvtph2ps_128_rrk:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtph2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x13,0xc8]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> %a1, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_x86_vcvtph2ps_128_rrkz(<8 x i16> %a0, i8 %mask) {
- ; CHECK: test_x86_vcvtph2ps_128_rrkz
- ; CHECK: vcvtph2ps %xmm0, %xmm0 {%k1} {z}
+; CHECK-LABEL: test_x86_vcvtph2ps_128_rrkz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x13,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
@@ -5347,228 +4977,133 @@ define <4 x float> @test_x86_vcvtph2ps_128_rrkz(<8 x i16> %a0, i8 %mask) {
declare <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16>, <4 x float>, i8) nounwind readonly
define <8 x float> @test_x86_vcvtph2ps_256(<8 x i16> %a0) {
- ; CHECK: test_x86_vcvtph2ps_256
- ; CHECK: vcvtph2ps %xmm0, %ymm0
+; CHECK-LABEL: test_x86_vcvtph2ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcvtph2ps %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x13,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_x86_vcvtph2ps_256_rrk(<8 x i16> %a0,<8 x float> %a1, i8 %mask) {
- ; CHECK: test_x86_vcvtph2ps_256_rrk
- ; CHECK: vcvtph2ps %xmm0, %ymm1 {%k1}
+; CHECK-LABEL: test_x86_vcvtph2ps_256_rrk:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtph2ps %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x13,0xc8]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> %a1, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_x86_vcvtph2ps_256_rrkz(<8 x i16> %a0, i8 %mask) {
- ; CHECK: test_x86_vcvtph2ps_256_rrkz
- ; CHECK: vcvtph2ps %xmm0, %ymm0 {%k1} {z}
+; CHECK-LABEL: test_x86_vcvtph2ps_256_rrkz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x13,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16>, <8 x float>, i8) nounwind readonly
-define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0) {
- ; CHECK: test_x86_vcvtps2ph_128
- ; CHECK: vcvtps2ph $2, %xmm0, %xmm0
- %res = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
+define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %src) {
+; CHECK-LABEL: test_x86_vcvtps2ph_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtps2ph $2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x02]
+; CHECK-NEXT: vcvtps2ph $2, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc2,0x02]
+; CHECK-NEXT: vcvtps2ph $2, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x1d,0xc0,0x02]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf1,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 %mask)
+ %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> %src, i8 %mask)
+ %res0 = add <8 x i16> %res1, %res2
+ %res = add <8 x i16> %res3, %res0
ret <8 x i16> %res
}
-
declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float>, i32, <8 x i16>, i8) nounwind readonly
-define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0) {
- ; CHECK: test_x86_vcvtps2ph_256
- ; CHECK: vcvtps2ph $2, %ymm0, %xmm0
- %res = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
+define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %src) {
+; CHECK-LABEL: test_x86_vcvtps2ph_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vcvtps2ph $2, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x02]
+; CHECK-NEXT: vcvtps2ph $2, %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc2,0x02]
+; CHECK-NEXT: vcvtps2ph $2, %ymm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x1d,0xc0,0x02]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf1,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 %mask)
+ %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> %src, i8 %mask)
+ %res0 = add <8 x i16> %res1, %res2
+ %res = add <8 x i16> %res3, %res0
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float>, i32, <8 x i16>, i8) nounwind readonly
-declare <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float>, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: ## xmm1 = xmm0[0,0,2,2]
-; CHECK-NEXT: vmovsldup %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: ## xmm2 = xmm0[0,0,2,2]
-; CHECK-NEXT: vmovsldup %xmm0, %xmm0
-; CHECK-NEXT: ## xmm0 = xmm0[0,0,2,2]
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 -1)
- %res2 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2)
- %res3 = fadd <4 x float> %res, %res1
- %res4 = fadd <4 x float> %res2, %res3
- ret <4 x float> %res4
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float>, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: ## ymm1 = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vmovsldup %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: ## ymm2 = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vmovsldup %ymm0, %ymm0
-; CHECK-NEXT: ## ymm0 = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 -1)
- %res2 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2)
- %res3 = fadd <8 x float> %res, %res1
- %res4 = fadd <8 x float> %res2, %res3
- ret <8 x float> %res4
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float>, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: ## xmm1 = xmm0[1,1,3,3]
-; CHECK-NEXT: vmovshdup %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: ## xmm2 = xmm0[1,1,3,3]
-; CHECK-NEXT: vmovshdup %xmm0, %xmm0
-; CHECK-NEXT: ## xmm0 = xmm0[1,1,3,3]
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 -1)
- %res2 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2)
- %res3 = fadd <4 x float> %res, %res1
- %res4 = fadd <4 x float> %res2, %res3
- ret <4 x float> %res4
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float>, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: ## ymm1 = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT: vmovshdup %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: ## ymm2 = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT: vmovshdup %ymm0, %ymm0
-; CHECK-NEXT: ## ymm0 = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 -1)
- %res2 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2)
- %res3 = fadd <8 x float> %res, %res1
- %res4 = fadd <8 x float> %res2, %res3
- ret <8 x float> %res4
-}
-declare <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double>, <2 x double>, i8)
-
-define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movddup_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: ## xmm1 = xmm0[0,0]
-; CHECK-NEXT: vmovddup %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: ## xmm2 = xmm0[0,0]
-; CHECK-NEXT: vmovddup %xmm0, %xmm0
-; CHECK-NEXT: ## xmm0 = xmm0[0,0]
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 -1)
- %res2 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> zeroinitializer, i8 %x2)
- %res3 = fadd <2 x double> %res, %res1
- %res4 = fadd <2 x double> %res2, %res3
- ret <2 x double> %res4
-}
-
-declare <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double>, <4 x double>, i8)
-
-define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x double> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movddup_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: ## ymm1 = ymm0[0,0,2,2]
-; CHECK-NEXT: vmovddup %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: ## ymm2 = ymm0[0,0,2,2]
-; CHECK-NEXT: vmovddup %ymm0, %ymm0
-; CHECK-NEXT: ## ymm0 = ymm0[0,0,2,2]
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2)
- %res1 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 -1)
- %res2 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> zeroinitializer, i8 %x2)
- %res3 = fadd <4 x double> %res, %res1
- %res4 = fadd <4 x double> %res2, %res3
- ret <4 x double> %res4
-}
-
define <8 x float> @test_rsqrt_ps_256_rr(<8 x float> %a0) {
; CHECK-LABEL: test_rsqrt_ps_256_rr:
-; CHECK: vrsqrt14ps %ymm0, %ymm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: vrsqrt14ps %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x4e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_rsqrt_ps_256_rrkz(<8 x float> %a0, i8 %mask) {
; CHECK-LABEL: test_rsqrt_ps_256_rrkz:
-; CHECK: vrsqrt14ps %ymm0, %ymm0 {%k1} {z}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrsqrt14ps %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x4e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_rsqrt_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_rsqrt_ps_256_rrk:
-; CHECK: vrsqrt14ps %ymm0, %ymm1 {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrsqrt14ps %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x4e,0xc8]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> %a1, i8 %mask)
ret <8 x float> %res
}
define <4 x float> @test_rsqrt_ps_128_rr(<4 x float> %a0) {
; CHECK-LABEL: test_rsqrt_ps_128_rr:
-; CHECK: vrsqrt14ps %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: vrsqrt14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_rsqrt_ps_128_rrkz(<4 x float> %a0, i8 %mask) {
; CHECK-LABEL: test_rsqrt_ps_128_rrkz:
-; CHECK: vrsqrt14ps %xmm0, %xmm0 {%k1} {z}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrsqrt14ps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x4e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_rsqrt_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_rsqrt_ps_128_rrk:
-; CHECK: vrsqrt14ps %xmm0, %xmm1 {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrsqrt14ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x4e,0xc8]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask)
ret <4 x float> %res
}
@@ -5578,42 +5113,60 @@ declare <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float>, <4 x float>, i8
define <8 x float> @test_rcp_ps_256_rr(<8 x float> %a0) {
; CHECK-LABEL: test_rcp_ps_256_rr:
-; CHECK: vrcp14ps %ymm0, %ymm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: vrcp14ps %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x4c,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1)
ret <8 x float> %res
}
define <8 x float> @test_rcp_ps_256_rrkz(<8 x float> %a0, i8 %mask) {
; CHECK-LABEL: test_rcp_ps_256_rrkz:
-; CHECK: vrcp14ps %ymm0, %ymm0 {%k1} {z}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrcp14ps %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x4c,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
ret <8 x float> %res
}
define <8 x float> @test_rcp_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_rcp_ps_256_rrk:
-; CHECK: vrcp14ps %ymm0, %ymm1 {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrcp14ps %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x4c,0xc8]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> %a1, i8 %mask)
ret <8 x float> %res
}
define <4 x float> @test_rcp_ps_128_rr(<4 x float> %a0) {
; CHECK-LABEL: test_rcp_ps_128_rr:
-; CHECK: vrcp14ps %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: vrcp14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4c,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
ret <4 x float> %res
}
define <4 x float> @test_rcp_ps_128_rrkz(<4 x float> %a0, i8 %mask) {
; CHECK-LABEL: test_rcp_ps_128_rrkz:
-; CHECK: vrcp14ps %xmm0, %xmm0 {%k1} {z}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrcp14ps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x4c,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask)
ret <4 x float> %res
}
define <4 x float> @test_rcp_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_rcp_ps_128_rrk:
-; CHECK: vrcp14ps %xmm0, %xmm1 {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrcp14ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x4c,0xc8]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask)
ret <4 x float> %res
}
@@ -5621,45 +5174,62 @@ define <4 x float> @test_rcp_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %ma
declare <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
declare <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float>, <4 x float>, i8) nounwind readnone
-
define <4 x double> @test_rsqrt_pd_256_rr(<4 x double> %a0) {
; CHECK-LABEL: test_rsqrt_pd_256_rr:
-; CHECK: vrsqrt14pd %ymm0, %ymm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: vrsqrt14pd %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x4e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1)
ret <4 x double> %res
}
define <4 x double> @test_rsqrt_pd_256_rrkz(<4 x double> %a0, i8 %mask) {
; CHECK-LABEL: test_rsqrt_pd_256_rrkz:
-; CHECK: vrsqrt14pd %ymm0, %ymm0 {%k1} {z}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrsqrt14pd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x4e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
ret <4 x double> %res
}
define <4 x double> @test_rsqrt_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_rsqrt_pd_256_rrk:
-; CHECK: vrsqrt14pd %ymm0, %ymm1 {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrsqrt14pd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x4e,0xc8]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask)
ret <4 x double> %res
}
define <2 x double> @test_rsqrt_pd_128_rr(<2 x double> %a0) {
; CHECK-LABEL: test_rsqrt_pd_128_rr:
-; CHECK: vrsqrt14pd %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: vrsqrt14pd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x4e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1)
ret <2 x double> %res
}
define <2 x double> @test_rsqrt_pd_128_rrkz(<2 x double> %a0, i8 %mask) {
; CHECK-LABEL: test_rsqrt_pd_128_rrkz:
-; CHECK: vrsqrt14pd %xmm0, %xmm0 {%k1} {z}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrsqrt14pd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x4e,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 %mask)
ret <2 x double> %res
}
define <2 x double> @test_rsqrt_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_rsqrt_pd_128_rrk:
-; CHECK: vrsqrt14pd %xmm0, %xmm1 {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrsqrt14pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x4e,0xc8]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> %a1, i8 %mask)
ret <2 x double> %res
}
@@ -5669,42 +5239,60 @@ declare <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double>, <2 x double>,
define <4 x double> @test_rcp_pd_256_rr(<4 x double> %a0) {
; CHECK-LABEL: test_rcp_pd_256_rr:
-; CHECK: vrcp14pd %ymm0, %ymm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: vrcp14pd %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x4c,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1)
ret <4 x double> %res
}
define <4 x double> @test_rcp_pd_256_rrkz(<4 x double> %a0, i8 %mask) {
; CHECK-LABEL: test_rcp_pd_256_rrkz:
-; CHECK: vrcp14pd %ymm0, %ymm0 {%k1} {z}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrcp14pd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x4c,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
ret <4 x double> %res
}
define <4 x double> @test_rcp_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_rcp_pd_256_rrk:
-; CHECK: vrcp14pd %ymm0, %ymm1 {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrcp14pd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x4c,0xc8]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask)
ret <4 x double> %res
}
define <2 x double> @test_rcp_pd_128_rr(<2 x double> %a0) {
; CHECK-LABEL: test_rcp_pd_128_rr:
-; CHECK: vrcp14pd %xmm0, %xmm0
+; CHECK: ## BB#0:
+; CHECK-NEXT: vrcp14pd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x4c,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1)
ret <2 x double> %res
}
define <2 x double> @test_rcp_pd_128_rrkz(<2 x double> %a0, i8 %mask) {
; CHECK-LABEL: test_rcp_pd_128_rrkz:
-; CHECK: vrcp14pd %xmm0, %xmm0 {%k1} {z}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrcp14pd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x4c,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 %mask)
ret <2 x double> %res
}
define <2 x double> @test_rcp_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_rcp_pd_128_rrk:
-; CHECK: vrcp14pd %xmm0, %xmm1 {%k1}
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vrcp14pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x4c,0xc8]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> %a1, i8 %mask)
ret <2 x double> %res
}
@@ -5712,69 +5300,22 @@ define <2 x double> @test_rcp_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8
declare <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
declare <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double>, <2 x double>, i8) nounwind readnone
-define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double> %a1, i8 %mask ) {
-; CHECK-LABEL: test_x86_vbroadcast_sd_pd_256:
-; CHECK: kmovw %eax, %k1
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-
- %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1)
- %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> %a1, i8 %mask)
- %res2 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
- %res3 = fadd <4 x double> %res, %res1
- %res4 = fadd <4 x double> %res2, %res3
- ret <4 x double> %res4
-}
-declare <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double>, <4 x double>, i8) nounwind readonly
-
-define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %a1, i8 %mask ) {
-; CHECK-LABEL: test_x86_vbroadcast_ss_ps_256:
-; CHECK: kmovw %eax, %k1
-; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
-; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
-
- %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> %a1, i8 %mask)
- %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
- %res3 = fadd <8 x float> %res, %res1
- %res4 = fadd <8 x float> %res2, %res3
- ret <8 x float> %res4
-}
-declare <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float>, <8 x float>, i8) nounwind readonly
-
-define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask ) {
-; CHECK-LABEL: test_x86_vbroadcast_ss_ps_128:
-; CHECK: kmovw %eax, %k1
-; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
-
- %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask)
- %res2 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask)
- %res3 = fadd <4 x float> %res, %res1
- %res4 = fadd <4 x float> %res2, %res3
- ret <4 x float> %res4
-}
-declare <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float>, <4 x float>, i8) nounwind readonly
-
-
declare <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float>, <8 x float>, i8)
define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256(<4 x float> %x0, <8 x float> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256:
-; CHECK: kmovw %eax, %k1
-; CHECK: vshuff32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z}
-; CHECK: vshuff32x4 $0, %ymm0, %ymm0, %ymm1 {%k1}
-; CHECK: vshuff32x4 $0, %ymm0, %ymm0, %ymm0
-; CHECK: vaddps %ymm1, %ymm0, %ymm0
-; CHECK: vaddps %ymm0, %ymm2, %ymm0
-
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshuff32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x23,0xd0,0x00]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshuff32x4 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x23,0xc8,0x00]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshuff32x4 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x23,0xc0,0x00]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc1]
+; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res1 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 -1)
%res2 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 %mask)
%res3 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> zeroinitializer, i8 %mask)
@@ -5787,13 +5328,18 @@ declare <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32>, <8 x i32>,
define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256:
-; CHECK: kmovw %eax, %k1
-; CHECK: vshufi32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z}
-; CHECK: vshufi32x4 $0, %ymm0, %ymm0, %ymm1 {%k1}
-; CHECK: vshufi32x4 $0, %ymm0, %ymm0, %ymm0
-; CHECK: vpaddd %ymm1, %ymm0, %ymm0
-; CHECK: vpaddd %ymm0, %ymm2, %ymm0
-
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vshufi32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x43,0xd0,0x00]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshufi32x4 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x43,0xc8,0x00]
+; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vshufi32x4 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x43,0xc0,0x00]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xfe,0xc1]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res1 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 -1)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask)
%res3 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask)
@@ -5807,14 +5353,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64>, <2 x i64>, <2 x i6
define <2 x i64>@test_int_x86_avx512_mask_psrl_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xd3,0xd1]
+; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xd3,0xd9]
+; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xd3,0xc1]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xd4,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
@@ -5828,14 +5373,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64>, <2 x i64>, <4 x i6
define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xd3,0xd1]
+; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xd3,0xd9]
+; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xd3,0xc1]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xd4,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
@@ -5844,43 +5388,41 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x
ret <4 x i64> %res4
}
-declare <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64>, i8, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64>, i32, <2 x i64>, i8)
-define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i32 %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i8 255, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i8 255, <2 x i64> %x2, i8 -1)
- %res2 = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i8 255, <2 x i64> zeroinitializer, i8 %x3)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x73,0xd0,0xff]
+; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x73,0xd0,0xff]
+; CHECK-NEXT: vpsrlq $255, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x73,0xd0,0xff]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 255, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 255, <2 x i64> %x2, i8 -1)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 255, <2 x i64> zeroinitializer, i8 %x3)
%res3 = add <2 x i64> %res, %res1
%res4 = add <2 x i64> %res2, %res3
ret <2 x i64> %res4
}
-declare <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64>, i8, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64>, i32, <4 x i64>, i8)
-define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i8 255, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i8 255, <4 x i64> %x2, i8 -1)
- %res2 = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i8 255, <4 x i64> zeroinitializer, i8 %x3)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x73,0xd0,0xff]
+; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x73,0xd0,0xff]
+; CHECK-NEXT: vpsrlq $255, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x73,0xd0,0xff]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 255, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 255, <4 x i64> %x2, i8 -1)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 255, <4 x i64> zeroinitializer, i8 %x3)
%res3 = add <4 x i64> %res, %res1
%res4 = add <4 x i64> %res2, %res3
ret <4 x i64> %res4
@@ -5889,14 +5431,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32>, <4 x i32>, <4 x i3
define <4 x i32>@test_int_x86_avx512_mask_psrl_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd2,0xd1]
+; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd2,0xd9]
+; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xd2,0xc1]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
@@ -5910,14 +5451,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32>, <4 x i32>, <8 x i3
define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd2,0xd1]
+; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd2,0xd9]
+; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xd2,0xc1]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x65,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
@@ -5926,63 +5466,61 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x
ret <8 x i32> %res4
}
-declare <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32>, i8, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32>, i32, <4 x i32>, i8)
-define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrld $255, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpsrld $255, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpsrld $255, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i8 255, <4 x i32> %x2, i8 %x3)
- %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i8 255, <4 x i32> %x2, i8 -1)
- %res2 = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i8 255, <4 x i32> zeroinitializer, i8 %x3)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsrld $255, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xd0,0xff]
+; CHECK-NEXT: vpsrld $255, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xd0,0xff]
+; CHECK-NEXT: vpsrld $255, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xd0,0xff]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 255, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 255, <4 x i32> %x2, i8 -1)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 255, <4 x i32> zeroinitializer, i8 %x3)
%res3 = add <4 x i32> %res, %res1
%res4 = add <4 x i32> %res2, %res3
ret <4 x i32> %res4
}
-declare <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32>, i8, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32>, i32, <8 x i32>, i8)
-define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
+define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrld $255, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpsrld $255, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpsrld $255, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i8 255, <8 x i32> %x2, i8 %x3)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i8 255, <8 x i32> %x2, i8 -1)
- %res2 = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i8 255, <8 x i32> zeroinitializer, i8 %x3)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsrld $255, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xd0,0xff]
+; CHECK-NEXT: vpsrld $255, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xd0,0xff]
+; CHECK-NEXT: vpsrld $255, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xd0,0xff]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 255, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 255, <8 x i32> %x2, i8 -1)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 255, <8 x i32> zeroinitializer, i8 %x3)
%res3 = add <8 x i32> %res, %res1
%res4 = add <8 x i32> %res2, %res3
ret <8 x i32> %res4
}
-declare <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i8, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i32, <16 x i32>, i16)
-define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
+define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_512:
; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpsrld $255, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpsrld $255, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vpsrld $255, %zmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> %x2, i16 %x3)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> %x2, i16 -1)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i8 255, <16 x i32> zeroinitializer, i16 %x3)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsrld $255, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x72,0xd0,0xff]
+; CHECK-NEXT: vpsrld $255, %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xc9,0x72,0xd0,0xff]
+; CHECK-NEXT: vpsrld $255, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0x72,0xd0,0xff]
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 255, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 255, <16 x i32> %x2, i16 -1)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 255, <16 x i32> zeroinitializer, i16 %x3)
%res3 = add <16 x i32> %res, %res1
%res4 = add <16 x i32> %res2, %res3
ret <16 x i32> %res4
@@ -5993,14 +5531,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64>, <2 x i64>, <2 x i64
define <2 x i64>@test_int_x86_avx512_mask_psrlv2_di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv2_di:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x45,0xd1]
+; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x45,0xd9]
+; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x45,0xc1]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -6014,14 +5551,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64>, <4 x i64>, <4 x i64
define <4 x i64>@test_int_x86_avx512_mask_psrlv4_di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv4_di:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x45,0xd1]
+; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x45,0xd9]
+; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x45,0xc1]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -6035,14 +5571,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32>, <4 x i32>, <4 x i32
define <4 x i32>@test_int_x86_avx512_mask_psrlv4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv4_si:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x45,0xd1]
+; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x45,0xd9]
+; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x45,0xc1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -6056,14 +5591,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32>, <8 x i32>, <8 x i32
define <8 x i32>@test_int_x86_avx512_mask_psrlv8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv8_si:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x45,0xd1]
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x45,0xd9]
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x45,0xc1]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -6077,14 +5611,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32>, <4 x i32>, <4 x i3
define <4 x i32>@test_int_x86_avx512_mask_psra_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe2,0xd1]
+; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe2,0xd9]
+; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xe2,0xc1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -6098,14 +5631,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32>, <4 x i32>, <8 x i3
define <8 x i32>@test_int_x86_avx512_mask_psra_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe2,0xd1]
+; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe2,0xd9]
+; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xe2,0xc1]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -6114,43 +5646,41 @@ define <8 x i32>@test_int_x86_avx512_mask_psra_d_256(<8 x i32> %x0, <4 x i32> %x
ret <8 x i32> %res4
}
-declare <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32>, i8, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32>, i32, <4 x i32>, i8)
-define <4 x i32>@test_int_x86_avx512_mask_psra_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+define <4 x i32>@test_int_x86_avx512_mask_psra_di_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrad $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpsrad $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpsrad $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3)
- %res1 = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3)
- %res2 = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsrad $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xe0,0x03]
+; CHECK-NEXT: vpsrad $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xe0,0x03]
+; CHECK-NEXT: vpsrad $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xe0,0x03]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xca]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i32 3, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psra.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1)
%res3 = add <4 x i32> %res, %res1
%res4 = add <4 x i32> %res3, %res2
ret <4 x i32> %res4
}
-declare <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32>, i8, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32>, i32, <8 x i32>, i8)
-define <8 x i32>@test_int_x86_avx512_mask_psra_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
+define <8 x i32>@test_int_x86_avx512_mask_psra_di_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsrad $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpsrad $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpsrad $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3)
- %res2 = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsrad $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xe0,0x03]
+; CHECK-NEXT: vpsrad $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xe0,0x03]
+; CHECK-NEXT: vpsrad $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xe0,0x03]
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xca]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i32 3, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psra.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1)
%res3 = add <8 x i32> %res, %res1
%res4 = add <8 x i32> %res3, %res2
ret <8 x i32> %res4
@@ -6161,14 +5691,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64>, <2 x i64>, <2 x i6
define <2 x i64>@test_int_x86_avx512_mask_psra_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xe2,0xd1]
+; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xe2,0xd9]
+; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe2,0xc1]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -6182,14 +5711,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64>, <2 x i64>, <4 x i6
define <4 x i64>@test_int_x86_avx512_mask_psra_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xe2,0xd1]
+; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xe2,0xd9]
+; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xe2,0xc1]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -6198,43 +5726,41 @@ define <4 x i64>@test_int_x86_avx512_mask_psra_q_256(<4 x i64> %x0, <2 x i64> %x
ret <4 x i64> %res4
}
-declare <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64>, i8, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64>, i32, <2 x i64>, i8)
-define <2 x i64>@test_int_x86_avx512_mask_psra_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+define <2 x i64>@test_int_x86_avx512_mask_psra_qi_128(<2 x i64> %x0, i32 %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsraq $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpsraq $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpsraq $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsraq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xe0,0x03]
+; CHECK-NEXT: vpsraq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xe0,0x03]
+; CHECK-NEXT: vpsraq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xe0,0x03]
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i32 3, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 -1)
%res3 = add <2 x i64> %res, %res1
%res4 = add <2 x i64> %res3, %res2
ret <2 x i64> %res4
}
-declare <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64>, i8, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64>, i32, <4 x i64>, i8)
-define <4 x i64>@test_int_x86_avx512_mask_psra_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+define <4 x i64>@test_int_x86_avx512_mask_psra_qi_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsraq $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpsraq $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpsraq $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsraq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xe0,0x03]
+; CHECK-NEXT: vpsraq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xe0,0x03]
+; CHECK-NEXT: vpsraq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xe0,0x03]
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1)
%res3 = add <4 x i64> %res, %res1
%res4 = add <4 x i64> %res3, %res2
ret <4 x i64> %res4
@@ -6246,14 +5772,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32>, <4 x i32>, <4 x i3
define <4 x i32>@test_int_x86_avx512_mask_psll_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf2,0xd1]
+; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf2,0xd9]
+; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xf2,0xc1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -6267,14 +5792,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32>, <4 x i32>, <8 x i3
define <8 x i32>@test_int_x86_avx512_mask_psll_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf2,0xd1]
+; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf2,0xd9]
+; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xf2,0xc1]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -6283,43 +5807,41 @@ define <8 x i32>@test_int_x86_avx512_mask_psll_d_256(<8 x i32> %x0, <4 x i32> %x
ret <8 x i32> %res4
}
-declare <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32>, i8, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32>, i32, <4 x i32>, i8)
-define <4 x i32>@test_int_x86_avx512_mask_psll_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+define <4 x i32>@test_int_x86_avx512_mask_psll_di_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpslld $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpslld $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpslld $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3)
- %res1 = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3)
- %res2 = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpslld $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xf0,0x03]
+; CHECK-NEXT: vpslld $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xf0,0x03]
+; CHECK-NEXT: vpslld $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xf0,0x03]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xca]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i32 3, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1)
%res3 = add <4 x i32> %res, %res1
%res4 = add <4 x i32> %res3, %res2
ret <4 x i32> %res4
}
-declare <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32>, i8, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32>, i32, <8 x i32>, i8)
-define <8 x i32>@test_int_x86_avx512_mask_psll_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
+define <8 x i32>@test_int_x86_avx512_mask_psll_di_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpslld $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpslld $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpslld $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3)
- %res2 = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpslld $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xf0,0x03]
+; CHECK-NEXT: vpslld $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xf0,0x03]
+; CHECK-NEXT: vpslld $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xf0,0x03]
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xca]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i32 3, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1)
%res3 = add <8 x i32> %res, %res1
%res4 = add <8 x i32> %res3, %res2
ret <8 x i32> %res4
@@ -6330,14 +5852,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64>, <2 x i64>, <4 x i6
define <4 x i64>@test_int_x86_avx512_mask_psll_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xf3,0xd1]
+; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xf3,0xd9]
+; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xf3,0xc1]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -6346,213 +5867,58 @@ define <4 x i64>@test_int_x86_avx512_mask_psll_q_256(<4 x i64> %x0, <2 x i64> %x
ret <4 x i64> %res4
}
-declare <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64>, i8, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64>, i32, <2 x i64>, i8)
-define <2 x i64>@test_int_x86_avx512_mask_psll_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+define <2 x i64>@test_int_x86_avx512_mask_psll_qi_128(<2 x i64> %x0, i32 %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllq $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpsllq $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpsllq $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsllq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x73,0xf0,0x03]
+; CHECK-NEXT: vpsllq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x73,0xf0,0x03]
+; CHECK-NEXT: vpsllq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x73,0xf0,0x03]
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i32 3, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.psll.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 -1)
%res3 = add <2 x i64> %res, %res1
%res4 = add <2 x i64> %res3, %res2
ret <2 x i64> %res4
}
-declare <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64>, i8, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64>, i32, <4 x i64>, i8)
-define <4 x i64>@test_int_x86_avx512_mask_psll_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+define <4 x i64>@test_int_x86_avx512_mask_psll_qi_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllq $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vpsllq $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpsllq $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpsllq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x73,0xf0,0x03]
+; CHECK-NEXT: vpsllq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x73,0xf0,0x03]
+; CHECK-NEXT: vpsllq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x73,0xf0,0x03]
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.psll.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1)
%res3 = add <4 x i64> %res, %res1
%res4 = add <4 x i64> %res3, %res2
ret <4 x i64> %res4
}
-define <8 x float> @test_mask_load_aligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_aligned_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps (%rdi), %ymm0
-; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1}
-; CHECK-NEXT: vmovaps (%rdi), %ymm1 {%k1} {z}
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 -1)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> %res, i8 %mask)
- %res2 = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 %mask)
- %res4 = fadd <8 x float> %res2, %res1
- ret <8 x float> %res4
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8*, <8 x float>, i8)
-
-define <8 x float> @test_mask_load_unaligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_unaligned_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovups (%rdi), %ymm0
-; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1}
-; CHECK-NEXT: vmovups (%rdi), %ymm1 {%k1} {z}
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 -1)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> %res, i8 %mask)
- %res2 = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 %mask)
- %res4 = fadd <8 x float> %res2, %res1
- ret <8 x float> %res4
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8*, <8 x float>, i8)
-
-define <4 x double> @test_mask_load_aligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_aligned_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovapd (%rdi), %ymm0
-; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1}
-; CHECK-NEXT: vmovapd (%rdi), %ymm1 {%k1} {z}
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 -1)
- %res1 = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> %res, i8 %mask)
- %res2 = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 %mask)
- %res4 = fadd <4 x double> %res2, %res1
- ret <4 x double> %res4
-}
-
-declare <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8*, <4 x double>, i8)
-
-define <4 x double> @test_mask_load_unaligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_unaligned_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovupd (%rdi), %ymm0
-; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1}
-; CHECK-NEXT: vmovupd (%rdi), %ymm1 {%k1} {z}
-; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 -1)
- %res1 = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> %res, i8 %mask)
- %res2 = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 %mask)
- %res4 = fadd <4 x double> %res2, %res1
- ret <4 x double> %res4
-}
-
-declare <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8*, <4 x double>, i8)
-
-define <4 x float> @test_mask_load_aligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_aligned_ps_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovaps (%rdi), %xmm0
-; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1}
-; CHECK-NEXT: vmovaps (%rdi), %xmm1 {%k1} {z}
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 -1)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> %res, i8 %mask)
- %res2 = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 %mask)
- %res4 = fadd <4 x float> %res2, %res1
- ret <4 x float> %res4
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8*, <4 x float>, i8)
-
-define <4 x float> @test_mask_load_unaligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_unaligned_ps_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovups (%rdi), %xmm0
-; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1}
-; CHECK-NEXT: vmovups (%rdi), %xmm1 {%k1} {z}
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 -1)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> %res, i8 %mask)
- %res2 = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 %mask)
- %res4 = fadd <4 x float> %res2, %res1
- ret <4 x float> %res4
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8*, <4 x float>, i8)
-
-define <2 x double> @test_mask_load_aligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_aligned_pd_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovapd (%rdi), %xmm0
-; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1}
-; CHECK-NEXT: vmovapd (%rdi), %xmm1 {%k1} {z}
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 -1)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> %res, i8 %mask)
- %res2 = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 %mask)
- %res4 = fadd <2 x double> %res2, %res1
- ret <2 x double> %res4
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8*, <2 x double>, i8)
-
-define <2 x double> @test_mask_load_unaligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_unaligned_pd_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovupd (%rdi), %xmm0
-; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1}
-; CHECK-NEXT: vmovupd (%rdi), %xmm1 {%k1} {z}
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 -1)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> %res, i8 %mask)
- %res2 = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 %mask)
- %res4 = fadd <2 x double> %res2, %res1
- ret <2 x double> %res4
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8*, <2 x double>, i8)
-
declare <4 x i32> @llvm.x86.avx512.mask.psrav4.si(<4 x i32>, <4 x i32>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_psrav4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav4_si:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x46,0xd1]
+; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x46,0xd9]
+; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x46,0xc1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psrav4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.psrav4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.psrav4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -6566,14 +5932,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32>, <8 x i32>, <8 x i32
define <8 x i32>@test_int_x86_avx512_mask_psrav8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_si:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x46,0xd1]
+; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x46,0xd9]
+; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x46,0xc1]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -6582,19 +5947,31 @@ define <8 x i32>@test_int_x86_avx512_mask_psrav8_si(<8 x i32> %x0, <8 x i32> %x1
ret <8 x i32> %res4
}
+define <8 x i32>@test_int_x86_avx512_mask_psrav8_si_const() {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_si_const:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; CHECK-NEXT: ## encoding: [0x62,0xf1,0x7d,0x28,0x6f,0x05,A,A,A,A]
+; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI371_0-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x46,0x05,A,A,A,A]
+; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI371_1-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>, <8 x i32> zeroinitializer, i8 -1)
+ ret <8 x i32> %res
+}
+
declare <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x46,0xd1]
+; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x46,0xd9]
+; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x46,0xc1]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -6603,19 +5980,31 @@ define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128(<2 x i64> %x0, <2 x i64> %
ret <2 x i64> %res4
}
+define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128_const(i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_psrav_q_128_const:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} xmm0 = [2,18446744073709551607]
+; CHECK-NEXT: ## encoding: [0x62,0xf1,0xfd,0x08,0x6f,0x05,A,A,A,A]
+; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI373_0-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x46,0x05,A,A,A,A]
+; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI373_1-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> <i64 2, i64 -9>, <2 x i64> <i64 1, i64 90>, <2 x i64> zeroinitializer, i8 -1)
+ ret <2 x i64> %res
+}
+
declare <4 x i64> @llvm.x86.avx512.mask.psrav.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_psrav_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x46,0xd1]
+; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x46,0xd9]
+; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x46,0xc1]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psrav.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.psrav.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.psrav.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -6629,14 +6018,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.psllv2.di(<2 x i64>, <2 x i64>, <2 x i64
define <2 x i64>@test_int_x86_avx512_mask_psllv2_di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv2_di:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x47,0xd1]
+; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x47,0xd9]
+; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x47,0xc1]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psllv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.psllv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.psllv2.di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -6650,14 +6038,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.psllv4.di(<4 x i64>, <4 x i64>, <4 x i64
define <4 x i64>@test_int_x86_avx512_mask_psllv4_di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv4_di:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x47,0xd1]
+; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x47,0xd9]
+; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x47,0xc1]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.psllv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.psllv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.psllv4.di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -6671,14 +6058,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.psllv4.si(<4 x i32>, <4 x i32>, <4 x i32
define <4 x i32>@test_int_x86_avx512_mask_psllv4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv4_si:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x47,0xd1]
+; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x47,0xd9]
+; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x47,0xc1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psllv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.psllv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.psllv4.si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -6692,14 +6078,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.psllv8.si(<8 x i32>, <8 x i32>, <8 x i32
define <8 x i32>@test_int_x86_avx512_mask_psllv8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv8_si:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x47,0xd1]
+; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x47,0xd9]
+; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x47,0xc1]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psllv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.psllv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.psllv8.si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -6713,14 +6098,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32>, <4 x i32>, <4 x i
define <4 x i32>@test_int_x86_avx512_mask_prorv_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x14,0xd1]
+; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x14,0xd9]
+; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x14,0xc1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -6734,14 +6118,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32>, <8 x i32>, <8 x i
define <8 x i32>@test_int_x86_avx512_mask_prorv_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x14,0xd1]
+; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x14,0xd9]
+; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x14,0xc1]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -6755,14 +6138,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64>, <2 x i64>, <2 x i
define <2 x i64>@test_int_x86_avx512_mask_prorv_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x14,0xd1]
+; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x14,0xd9]
+; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x14,0xc1]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -6776,14 +6158,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64>, <4 x i64>, <4 x i
define <4 x i64>@test_int_x86_avx512_mask_prorv_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x14,0xd1]
+; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x14,0xd9]
+; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x14,0xc1]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -6791,85 +6172,82 @@ define <4 x i64>@test_int_x86_avx512_mask_prorv_q_256(<4 x i64> %x0, <4 x i64> %
%res4 = add <4 x i64> %res3, %res2
ret <4 x i64> %res4
}
-declare <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32>, i8, <4 x i32>, i8)
-define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+declare <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32>, i32, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vprold $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3)
- %res1 = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3)
- %res2 = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc8,0x03]
+; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc8,0x03]
+; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xca]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1)
%res3 = add <4 x i32> %res, %res1
%res4 = add <4 x i32> %res3, %res2
ret <4 x i32> %res4
}
-declare <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32>, i8, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32>, i32, <8 x i32>, i8)
-define <8 x i32>@test_int_x86_avx512_mask_prol_d_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
+define <8 x i32>@test_int_x86_avx512_mask_prol_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vprold $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3)
- %res2 = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc8,0x03]
+; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc8,0x03]
+; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03]
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xca]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1)
%res3 = add <8 x i32> %res, %res1
%res4 = add <8 x i32> %res3, %res2
ret <8 x i32> %res4
}
-declare <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64>, i8, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64>, i32, <2 x i64>, i8)
-define <2 x i64>@test_int_x86_avx512_mask_prol_q_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+define <2 x i64>@test_int_x86_avx512_mask_prol_q_128(<2 x i64> %x0, i32 %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vprolq $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc8,0x03]
+; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc8,0x03]
+; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03]
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 -1)
%res3 = add <2 x i64> %res, %res1
%res4 = add <2 x i64> %res3, %res2
ret <2 x i64> %res4
}
-declare <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64>, i8, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64>, i32, <4 x i64>, i8)
-define <4 x i64>@test_int_x86_avx512_mask_prol_q_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+define <4 x i64>@test_int_x86_avx512_mask_prol_q_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vprolq $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc8,0x03]
+; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc8,0x03]
+; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03]
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1)
%res3 = add <4 x i64> %res, %res1
%res4 = add <4 x i64> %res3, %res2
ret <4 x i64> %res4
@@ -6880,14 +6258,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32>, <4 x i32>, <4 x i
define <4 x i32>@test_int_x86_avx512_mask_prolv_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prolv_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x15,0xd1]
+; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x15,0xd9]
+; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x15,0xc1]
+; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -6901,14 +6278,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32>, <8 x i32>, <8 x i
define <8 x i32>@test_int_x86_avx512_mask_prolv_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prolv_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x15,0xd1]
+; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x15,0xd9]
+; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x15,0xc1]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -6922,14 +6298,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64>, <2 x i64>, <2 x i
define <2 x i64>@test_int_x86_avx512_mask_prolv_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prolv_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x15,0xd1]
+; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x15,0xd9]
+; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x15,0xc1]
+; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -6943,14 +6318,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64>, <4 x i64>, <4 x i
define <4 x i64>@test_int_x86_avx512_mask_prolv_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prolv_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm2 {%k1}
-; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm3 {%k1} {z}
-; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x15,0xd1]
+; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x15,0xd9]
+; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x15,0xc1]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -6959,85 +6333,81 @@ define <4 x i64>@test_int_x86_avx512_mask_prolv_q_256(<4 x i64> %x0, <4 x i64> %
ret <4 x i64> %res4
}
-declare <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32>, i8, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32>, i32, <4 x i32>, i8)
-define <4 x i32>@test_int_x86_avx512_mask_pror_d_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
+define <4 x i32>@test_int_x86_avx512_mask_pror_d_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pror_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vprord $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3)
- %res1 = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3)
- %res2 = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc0,0x03]
+; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc0,0x03]
+; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xca]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1)
%res3 = add <4 x i32> %res, %res1
%res4 = add <4 x i32> %res3, %res2
ret <4 x i32> %res4
}
-declare <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32>, i8, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32>, i32, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pror_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pror_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vprord $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3)
- %res2 = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc0,0x03]
+; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc0,0x03]
+; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03]
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xca]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1)
%res3 = add <8 x i32> %res, %res1
%res4 = add <8 x i32> %res3, %res2
ret <8 x i32> %res4
}
-declare <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64>, i8, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64>, i32, <2 x i64>, i8)
-define <2 x i64>@test_int_x86_avx512_mask_pror_q_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
+define <2 x i64>@test_int_x86_avx512_mask_pror_q_128(<2 x i64> %x0, i32 %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pror_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vprorq $3, %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc0,0x03]
+; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc0,0x03]
+; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03]
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 -1)
%res3 = add <2 x i64> %res, %res1
%res4 = add <2 x i64> %res3, %res2
ret <2 x i64> %res4
}
-declare <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64>, i8, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64>, i32, <4 x i64>, i8)
-define <4 x i64>@test_int_x86_avx512_mask_pror_q_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
+define <4 x i64>@test_int_x86_avx512_mask_pror_q_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pror_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %sil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vprorq $3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc0,0x03]
+; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc0,0x03]
+; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03]
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1)
%res3 = add <4 x i64> %res, %res1
%res4 = add <4 x i64> %res3, %res2
ret <4 x i64> %res4
@@ -7048,14 +6418,16 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovzxb_d_128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxbd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovzxbd %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxbd %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxbd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x31,0xc8]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: vpmovzxbd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x31,0xd0]
+; CHECK-NEXT: ## xmm2 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: vpmovzxbd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x31,0xc0]
+; CHECK-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xca]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1)
@@ -7069,14 +6441,16 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmovzxb_d_256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxbd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovzxbd %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxbd %xmm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxbd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x31,0xc8]
+; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-NEXT: vpmovzxbd %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x31,0xd0]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-NEXT: vpmovzxbd %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x31,0xc0]
+; CHECK-NEXT: ## ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xca]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1)
@@ -7090,14 +6464,16 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_pmovzxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxbq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovzxbq %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxbq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxbq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x32,0xc8]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpmovzxbq %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x32,0xd0]
+; CHECK-NEXT: ## xmm2 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpmovzxbq %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x32,0xc0]
+; CHECK-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1)
@@ -7111,14 +6487,16 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_pmovzxb_q_256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxbq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovzxbq %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxbq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxbq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x32,0xc8]
+; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpmovzxbq %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x32,0xd0]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpmovzxbq %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x32,0xc0]
+; CHECK-NEXT: ## ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1)
@@ -7132,14 +6510,16 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_pmovzxd_q_128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxdq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovzxdq %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxdq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxdq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x35,0xc8]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero
+; CHECK-NEXT: vpmovzxdq %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x35,0xd0]
+; CHECK-NEXT: ## xmm2 {%k1} {z} = xmm0[0],zero,xmm0[1],zero
+; CHECK-NEXT: vpmovzxdq %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x35,0xc0]
+; CHECK-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 -1)
@@ -7153,14 +6533,16 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_pmovzxd_q_256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxdq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovzxdq %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxdq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxdq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x35,0xc8]
+; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: vpmovzxdq %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x35,0xd0]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: vpmovzxdq %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x35,0xc0]
+; CHECK-NEXT: ## ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 -1)
@@ -7174,14 +6556,16 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovzxw_d_128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxwd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovzxwd %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxwd %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxwd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x33,0xc8]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: vpmovzxwd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x33,0xd0]
+; CHECK-NEXT: ## xmm2 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: vpmovzxwd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x33,0xc0]
+; CHECK-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xca]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1)
@@ -7195,14 +6579,16 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmovzxw_d_256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxwd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovzxwd %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxwd %xmm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxwd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x33,0xc8]
+; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpmovzxwd %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x33,0xd0]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpmovzxwd %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x33,0xc0]
+; CHECK-NEXT: ## ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xca]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> zeroinitializer, i8 %x2)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1)
@@ -7216,14 +6602,16 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_pmovzxw_q_128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxwq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovzxwq %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxwq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxwq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x34,0xc8]
+; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; CHECK-NEXT: vpmovzxwq %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x34,0xd0]
+; CHECK-NEXT: ## xmm2 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; CHECK-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x34,0xc0]
+; CHECK-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1)
@@ -7237,14 +6625,16 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_pmovzxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovzxwq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovzxwq %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovzxwq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovzxwq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x34,0xc8]
+; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: vpmovzxwq %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x34,0xd0]
+; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: vpmovzxwq %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x34,0xc0]
+; CHECK-NEXT: ## ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1)
@@ -7258,14 +6648,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovsxb_d_128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxbd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsxbd %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxbd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x21,0xc8]
+; CHECK-NEXT: vpmovsxbd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x21,0xd0]
+; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x21,0xc0]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xca]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1)
@@ -7279,14 +6668,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmovsxb_d_256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxbd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovsxbd %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxbd %xmm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxbd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x21,0xc8]
+; CHECK-NEXT: vpmovsxbd %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x21,0xd0]
+; CHECK-NEXT: vpmovsxbd %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x21,0xc0]
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xca]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1)
@@ -7300,14 +6688,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_pmovsxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxbq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsxbq %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxbq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxbq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x22,0xc8]
+; CHECK-NEXT: vpmovsxbq %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x22,0xd0]
+; CHECK-NEXT: vpmovsxbq %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x22,0xc0]
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1)
@@ -7321,14 +6708,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_pmovsxb_q_256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxbq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovsxbq %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxbq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x22,0xc8]
+; CHECK-NEXT: vpmovsxbq %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x22,0xd0]
+; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x22,0xc0]
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1)
@@ -7342,14 +6728,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovsxw_d_128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxwd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsxwd %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxwd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x23,0xc8]
+; CHECK-NEXT: vpmovsxwd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x23,0xd0]
+; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x23,0xc0]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xca]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2)
%res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1)
@@ -7363,14 +6748,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmovsxw_d_256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxwd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovsxwd %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0
-; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxwd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x23,0xc8]
+; CHECK-NEXT: vpmovsxwd %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x23,0xd0]
+; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x23,0xc0]
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xca]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> zeroinitializer, i8 %x2)
%res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1)
@@ -7384,14 +6768,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_pmovsxw_q_128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_128:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxwq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vpmovsxwq %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0
-; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxwq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x24,0xc8]
+; CHECK-NEXT: vpmovsxwq %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x24,0xd0]
+; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x24,0xc0]
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xca]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2)
%res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1)
@@ -7405,14 +6788,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_pmovsxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpmovsxwq %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vpmovsxwq %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vpmovsxwq %xmm0, %ymm0
-; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpmovsxwq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x24,0xc8]
+; CHECK-NEXT: vpmovsxwq %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x24,0xd0]
+; CHECK-NEXT: vpmovsxwq %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x24,0xc0]
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xca]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2)
%res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1)
@@ -7420,3 +6802,494 @@ define <4 x i64>@test_int_x86_avx512_mask_pmovsxw_q_256(<8 x i16> %x0, <4 x i64>
%res4 = add <4 x i64> %res3, %res2
ret <4 x i64> %res4
}
+
+declare <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double>, <4 x i64>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_df_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0x16,0xd0]
+; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xd8]
+; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0x16,0xc0]
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0x58,0xcb]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> zeroinitializer, i8 %x3)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1)
+ %res3 = fadd <4 x double> %res, %res1
+ %res4 = fadd <4 x double> %res3, %res2
+ ret <4 x double> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_di_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermq %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0x36,0xd0]
+; CHECK-NEXT: vpermq %ymm0, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xd8]
+; CHECK-NEXT: vpermq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0x36,0xc0]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float>, <8 x i32>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_permvar_sf_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_sf_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0x16,0xd0]
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0x16,0xd8]
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0x16,0xc0]
+; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6c,0x28,0x58,0xcb]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> zeroinitializer, i8 %x3)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res3, %res2
+ ret <8 x float> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_permvar_si_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_permvar_si_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0x36,0xd0]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0x36,0xd8]
+; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0x75,0x28,0x36,0xc0]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double>, <2 x double>, <2 x i64>, i32, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_fixupimm_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x54,0xda,0x05]
+; CHECK-NEXT: vpxord %xmm4, %xmm4, %xmm4 ## encoding: [0x62,0xf1,0x5d,0x08,0xef,0xe4]
+; CHECK-NEXT: vfixupimmpd $4, %xmm2, %xmm1, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xe2,0x04]
+; CHECK-NEXT: vfixupimmpd $3, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x54,0xc2,0x03]
+; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xcc]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1,<2 x i64> %x2, i32 5, i8 %x4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 %x4)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 3, i8 -1)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res3, %res2
+ ret <2 x double> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double>, <2 x double>, <2 x i64>, i32, i8)
+
+define <2 x double>@test_int_x86_avx512_maskz_fixupimm_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xda,0x05]
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vfixupimmpd $3, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xc2,0x03]
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0xe5,0x08,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4)
+ %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 3, i8 %x4)
+ ;%res2 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 -1)
+ %res3 = fadd <2 x double> %res, %res1
+ ;%res4 = fadd <2 x double> %res3, %res2
+ ret <2 x double> %res3
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double>, <4 x double>, <4 x i64>, i32, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_fixupimm_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfixupimmpd $4, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x54,0xda,0x04]
+; CHECK-NEXT: vpxord %ymm4, %ymm4, %ymm4 ## encoding: [0x62,0xf1,0x5d,0x28,0xef,0xe4]
+; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xe2,0x05]
+; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
+; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm1 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xcc]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 4, i8 %x4)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> %x1, <4 x i64> %x2 , i32 5, i8 %x4)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 3, i8 -1)
+ %res3 = fadd <4 x double> %res, %res1
+ %res4 = fadd <4 x double> %res3, %res2
+ ret <4 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double>, <4 x double>, <4 x i64>, i32, i8)
+
+define <4 x double>@test_int_x86_avx512_maskz_fixupimm_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xda,0x05]
+; CHECK-NEXT: vpxord %ymm4, %ymm4, %ymm4 ## encoding: [0x62,0xf1,0x5d,0x28,0xef,0xe4]
+; CHECK-NEXT: vmovaps %ymm0, %ymm5 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xe8]
+; CHECK-NEXT: vfixupimmpd $4, %ymm4, %ymm1, %ymm5 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xec,0x04]
+; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
+; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm1 ## encoding: [0x62,0xf1,0xe5,0x28,0x58,0xcd]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 5, i8 %x4)
+ %res1 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> zeroinitializer, i32 4, i8 %x4)
+ %res2 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 3, i8 -1)
+ %res3 = fadd <4 x double> %res, %res1
+ %res4 = fadd <4 x double> %res3, %res2
+ ret <4 x double> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float>, <4 x float>, <4 x i32>, i32, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_fixupimm_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0x75,0x09,0x54,0xda,0x05]
+; CHECK-NEXT: vmovaps %xmm0, %xmm4 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xe0]
+; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm4 ## encoding: [0x62,0xf3,0x75,0x08,0x54,0xe2,0x05]
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x09,0x54,0xc2,0x05]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc4]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res3, %res2
+ ret <4 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float>, <4 x float>, <4 x i32>, i32, i8)
+
+define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xd8]
+; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0x89,0x54,0xda,0x05]
+; CHECK-NEXT: vmovaps %xmm0, %xmm4 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xe0]
+; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm4 ## encoding: [0x62,0xf3,0x75,0x08,0x54,0xe2,0x05]
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0x89,0x54,0xc2,0x05]
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 ## encoding: [0x62,0xf1,0x64,0x08,0x58,0xc0]
+; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc4]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4)
+ %res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4)
+ %res2 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res3, %res2
+ ret <4 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float>, <8 x float>, <8 x i32>, i32, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_fixupimm_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x54,0xda,0x05]
+; CHECK-NEXT: vmovaps %ymm0, %ymm4 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xe0]
+; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm4 ## encoding: [0x62,0xf3,0x75,0x28,0x54,0xe2,0x05]
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x54,0xc2,0x05]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: vaddps %ymm4, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc4]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 %x4)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> zeroinitializer, i32 5, i8 %x4)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 -1)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res3, %res2
+ ret <8 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float>, <8 x float>, <8 x i32>, i32, i8)
+
+define <8 x float>@test_int_x86_avx512_maskz_fixupimm_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xd8]
+; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x54,0xda,0x05]
+; CHECK-NEXT: vmovaps %ymm0, %ymm4 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0xe0]
+; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm4 ## encoding: [0x62,0xf3,0x75,0x28,0x54,0xe2,0x05]
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x54,0xc2,0x05]
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## encoding: [0x62,0xf1,0x64,0x28,0x58,0xc0]
+; CHECK-NEXT: vaddps %ymm4, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc4]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 %x4)
+ %res1 = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> zeroinitializer, i32 5, i8 %x4)
+ %res2 = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 -1)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res3, %res2
+ ret <8 x float> %res4
+}
+
+declare i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32>, <4 x i32>,i8)
+
+define i8@test_int_x86_avx512_ptestm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestmd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestmd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32>, <8 x i32>, i8)
+
+define i8@test_int_x86_avx512_ptestm_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestmd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestmd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x28,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64>, <2 x i64>, i8)
+
+define i8@test_int_x86_avx512_ptestm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestmq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestmq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64>, <4 x i64>, i8)
+
+define i8@test_int_x86_avx512_ptestm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestmq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestmq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32>, <4 x i32>, i8 %x2)
+
+define i8@test_int_x86_avx512_ptestnm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestnmd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestnmd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32>, <8 x i32>, i8 %x2)
+
+define i8@test_int_x86_avx512_ptestnm_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestnmd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestnmd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64>, <2 x i64>, i8 %x2)
+
+define i8@test_int_x86_avx512_ptestnm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestnmq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x09,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestnmq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64>, <4 x i64>, i8 %x2)
+
+define i8@test_int_x86_avx512_ptestnm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestnmq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x29,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT: vptestnmq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x27,0xc1]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_256(i32 %x0, <8 x i32> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7c,0xc7]
+; CHECK-NEXT: vpbroadcastd %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7c,0xcf]
+; CHECK-NEXT: vpbroadcastd %edi, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7c,0xd7]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 -1)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 %mask)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> zeroinitializer, i8 %mask)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res2, %res3
+ ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_128(i32 %x0, <4 x i32> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7c,0xc7]
+; CHECK-NEXT: vpbroadcastd %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7c,0xcf]
+; CHECK-NEXT: vpbroadcastd %edi, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7c,0xd7]
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0x6d,0x08,0xfe,0xc0]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x75,0x08,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 -1)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 %mask)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> zeroinitializer, i8 %mask)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res2, %res3
+ ret <4 x i32> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_256(i64 %x0, <4 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7c,0xc7]
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x7c,0xcf]
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm2 ## encoding: [0x62,0xf2,0xfd,0x28,0x7c,0xd7]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0xed,0x28,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0xf5,0x28,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1, i8 -1)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1, i8 %mask)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> zeroinitializer, i8 %mask)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res2, %res3
+ ret <4 x i64> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_128(i64 %x0, <2 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7c,0xc7]
+; CHECK-NEXT: vpbroadcastq %rdi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x7c,0xcf]
+; CHECK-NEXT: vpbroadcastq %rdi, %xmm2 ## encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd7]
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## encoding: [0x62,0xf1,0xed,0x08,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf1,0xf5,0x08,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1, i8 -1)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1, i8 %mask)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> zeroinitializer, i8 %mask)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res2, %res3
+ ret <2 x i64> %res4
+}
diff --git a/test/CodeGen/X86/avx512vl-logic.ll b/test/CodeGen/X86/avx512vl-logic.ll
index 02cb8f978656..d6e1a7dd5391 100644
--- a/test/CodeGen/X86/avx512vl-logic.ll
+++ b/test/CodeGen/X86/avx512vl-logic.ll
@@ -13,6 +13,18 @@ entry:
ret <8 x i32> %x
}
+; CHECK-LABEL: vpandnd256
+; CHECK: vpandnd %ymm
+; CHECK: ret
+define <8 x i32> @vpandnd256(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %b2 = xor <8 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %x = and <8 x i32> %a2, %b2
+ ret <8 x i32> %x
+}
+
; CHECK-LABEL: vpord256
; CHECK: vpord %ymm
; CHECK: ret
@@ -46,6 +58,18 @@ entry:
ret <4 x i64> %x
}
+; CHECK-LABEL: vpandnq256
+; CHECK: vpandnq %ymm
+; CHECK: ret
+define <4 x i64> @vpandnq256(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
+ %b2 = xor <4 x i64> %b, <i64 -1, i64 -1, i64 -1, i64 -1>
+ %x = and <4 x i64> %a2, %b2
+ ret <4 x i64> %x
+}
+
; CHECK-LABEL: vporq256
; CHECK: vporq %ymm
; CHECK: ret
@@ -81,6 +105,18 @@ entry:
ret <4 x i32> %x
}
+; CHECK-LABEL: vpandnd128
+; CHECK: vpandnd %xmm
+; CHECK: ret
+define <4 x i32> @vpandnd128(<4 x i32> %a, <4 x i32> %b) nounwind uwtable readnone ssp {
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <4 x i32> %a, <i32 1, i32 1, i32 1, i32 1>
+ %b2 = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %x = and <4 x i32> %a2, %b2
+ ret <4 x i32> %x
+}
+
; CHECK-LABEL: vpord128
; CHECK: vpord %xmm
; CHECK: ret
@@ -114,6 +150,18 @@ entry:
ret <2 x i64> %x
}
+; CHECK-LABEL: vpandnq128
+; CHECK: vpandnq %xmm
+; CHECK: ret
+define <2 x i64> @vpandnq128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <2 x i64> %a, <i64 1, i64 1>
+ %b2 = xor <2 x i64> %b, <i64 -1, i64 -1>
+ %x = and <2 x i64> %a2, %b2
+ ret <2 x i64> %x
+}
+
; CHECK-LABEL: vporq128
; CHECK: vporq %xmm
; CHECK: ret
diff --git a/test/CodeGen/X86/avx512vl-mov.ll b/test/CodeGen/X86/avx512vl-mov.ll
index 18fa0a142a2d..0838fb5c0439 100644
--- a/test/CodeGen/X86/avx512vl-mov.ll
+++ b/test/CodeGen/X86/avx512vl-mov.ll
@@ -1,153 +1,173 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s
-; CHECK-LABEL: test_256_1
-; CHECK: vmovdqu32
-; CHECK: ret
define <8 x i32> @test_256_1(i8 * %addr) {
+; CHECK-LABEL: test_256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7e,0x28,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i32>*
%res = load <8 x i32>, <8 x i32>* %vaddr, align 1
ret <8 x i32>%res
}
-; CHECK-LABEL: test_256_2
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test_256_2(i8 * %addr) {
+; CHECK-LABEL: test_256_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i32>*
%res = load <8 x i32>, <8 x i32>* %vaddr, align 32
ret <8 x i32>%res
}
-; CHECK-LABEL: test_256_3
-; CHECK: vmovdqa64
-; CHECK: ret
define void @test_256_3(i8 * %addr, <4 x i64> %data) {
+; CHECK-LABEL: test_256_3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 %ymm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x28,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i64>*
store <4 x i64>%data, <4 x i64>* %vaddr, align 32
ret void
}
-; CHECK-LABEL: test_256_4
-; CHECK: vmovdqu32
-; CHECK: ret
define void @test_256_4(i8 * %addr, <8 x i32> %data) {
+; CHECK-LABEL: test_256_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu32 %ymm0, (%rdi) ## encoding: [0x62,0xf1,0x7e,0x28,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i32>*
store <8 x i32>%data, <8 x i32>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_256_5
-; CHECK: vmovdqa32
-; CHECK: ret
define void @test_256_5(i8 * %addr, <8 x i32> %data) {
+; CHECK-LABEL: test_256_5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 %ymm0, (%rdi) ## encoding: [0x62,0xf1,0x7d,0x28,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i32>*
store <8 x i32>%data, <8 x i32>* %vaddr, align 32
ret void
}
-; CHECK-LABEL: test_256_6
-; CHECK: vmovdqa64
-; CHECK: ret
define <4 x i64> @test_256_6(i8 * %addr) {
+; CHECK-LABEL: test_256_6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i64>*
%res = load <4 x i64>, <4 x i64>* %vaddr, align 32
ret <4 x i64>%res
}
-; CHECK-LABEL: test_256_7
-; CHECK: vmovdqu64
-; CHECK: ret
define void @test_256_7(i8 * %addr, <4 x i64> %data) {
+; CHECK-LABEL: test_256_7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu64 %ymm0, (%rdi) ## encoding: [0x62,0xf1,0xfe,0x28,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i64>*
store <4 x i64>%data, <4 x i64>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_256_8
-; CHECK: vmovdqu64
-; CHECK: ret
define <4 x i64> @test_256_8(i8 * %addr) {
+; CHECK-LABEL: test_256_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xfe,0x28,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i64>*
%res = load <4 x i64>, <4 x i64>* %vaddr, align 1
ret <4 x i64>%res
}
-; CHECK-LABEL: test_256_9
-; CHECK: vmovapd {{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_256_9(i8 * %addr, <4 x double> %data) {
+; CHECK-LABEL: test_256_9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovapd %ymm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x28,0x29,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x double>*
store <4 x double>%data, <4 x double>* %vaddr, align 32
ret void
}
-; CHECK-LABEL: test_256_10
-; CHECK: vmovapd {{.*}} ## encoding: [0x62
-; CHECK: ret
define <4 x double> @test_256_10(i8 * %addr) {
+; CHECK-LABEL: test_256_10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x double>*
%res = load <4 x double>, <4 x double>* %vaddr, align 32
ret <4 x double>%res
}
-; CHECK-LABEL: test_256_11
-; CHECK: vmovaps {{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_256_11(i8 * %addr, <8 x float> %data) {
+; CHECK-LABEL: test_256_11:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps %ymm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x28,0x29,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x float>*
store <8 x float>%data, <8 x float>* %vaddr, align 32
ret void
}
-; CHECK-LABEL: test_256_12
-; CHECK: vmovaps {{.*}} ## encoding: [0x62
-; CHECK: ret
define <8 x float> @test_256_12(i8 * %addr) {
+; CHECK-LABEL: test_256_12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x float>*
%res = load <8 x float>, <8 x float>* %vaddr, align 32
ret <8 x float>%res
}
-; CHECK-LABEL: test_256_13
-; CHECK: vmovupd {{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_256_13(i8 * %addr, <4 x double> %data) {
+; CHECK-LABEL: test_256_13:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd %ymm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x28,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x double>*
store <4 x double>%data, <4 x double>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_256_14
-; CHECK: vmovupd {{.*}} ## encoding: [0x62
-; CHECK: ret
define <4 x double> @test_256_14(i8 * %addr) {
+; CHECK-LABEL: test_256_14:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd (%rdi), %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x double>*
%res = load <4 x double>, <4 x double>* %vaddr, align 1
ret <4 x double>%res
}
-; CHECK-LABEL: test_256_15
-; CHECK: vmovups {{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_256_15(i8 * %addr, <8 x float> %data) {
+; CHECK-LABEL: test_256_15:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups %ymm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x28,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x float>*
store <8 x float>%data, <8 x float>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_256_16
-; CHECK: vmovups {{.*}} ## encoding: [0x62
-; CHECK: ret
define <8 x float> @test_256_16(i8 * %addr) {
+; CHECK-LABEL: test_256_16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups (%rdi), %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x float>*
%res = load <8 x float>, <8 x float>* %vaddr, align 1
ret <8 x float>%res
}
-; CHECK-LABEL: test_256_17
-; CHECK: vmovdqa32{{.*{%k[1-7]} }}
-; CHECK: ret
define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
+; CHECK-LABEL: test_256_17:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i32>*
%r = load <8 x i32>, <8 x i32>* %vaddr, align 32
@@ -155,10 +175,13 @@ define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
ret <8 x i32>%res
}
-; CHECK-LABEL: test_256_18
-; CHECK: vmovdqu32{{.*{%k[1-7]} }}
-; CHECK: ret
define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
+; CHECK-LABEL: test_256_18:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i32>*
%r = load <8 x i32>, <8 x i32>* %vaddr, align 1
@@ -166,10 +189,13 @@ define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
ret <8 x i32>%res
}
-; CHECK-LABEL: test_256_19
-; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <8 x i32> @test_256_19(i8 * %addr, <8 x i32> %mask1) {
+; CHECK-LABEL: test_256_19:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i32>*
%r = load <8 x i32>, <8 x i32>* %vaddr, align 32
@@ -177,10 +203,13 @@ define <8 x i32> @test_256_19(i8 * %addr, <8 x i32> %mask1) {
ret <8 x i32>%res
}
-; CHECK-LABEL: test_256_20
-; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <8 x i32> @test_256_20(i8 * %addr, <8 x i32> %mask1) {
+; CHECK-LABEL: test_256_20:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i32>*
%r = load <8 x i32>, <8 x i32>* %vaddr, align 1
@@ -188,10 +217,13 @@ define <8 x i32> @test_256_20(i8 * %addr, <8 x i32> %mask1) {
ret <8 x i32>%res
}
-; CHECK-LABEL: test_256_21
-; CHECK: vmovdqa64{{.*{%k[1-7]} }}
-; CHECK: ret
define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_21:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmq (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i64>*
%r = load <4 x i64>, <4 x i64>* %vaddr, align 32
@@ -199,10 +231,13 @@ define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
ret <4 x i64>%res
}
-; CHECK-LABEL: test_256_22
-; CHECK: vmovdqu64{{.*{%k[1-7]} }}
-; CHECK: ret
define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_22:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmq (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i64>*
%r = load <4 x i64>, <4 x i64>* %vaddr, align 1
@@ -210,10 +245,13 @@ define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
ret <4 x i64>%res
}
-; CHECK-LABEL: test_256_23
-; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <4 x i64> @test_256_23(i8 * %addr, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_23:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i64>*
%r = load <4 x i64>, <4 x i64>* %vaddr, align 32
@@ -221,10 +259,13 @@ define <4 x i64> @test_256_23(i8 * %addr, <4 x i64> %mask1) {
ret <4 x i64>%res
}
-; CHECK-LABEL: test_256_24
-; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <4 x i64> @test_256_24(i8 * %addr, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_24:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xa9,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i64>*
%r = load <4 x i64>, <4 x i64>* %vaddr, align 1
@@ -232,10 +273,14 @@ define <4 x i64> @test_256_24(i8 * %addr, <4 x i64> %mask1) {
ret <4 x i64>%res
}
-; CHECK-LABEL: test_256_25
-; CHECK: vmovaps{{.*{%k[1-7]} }}
-; CHECK: ret
define <8 x float> @test_256_25(i8 * %addr, <8 x float> %old, <8 x float> %mask1) {
+; CHECK-LABEL: test_256_25:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vcmpordps %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf1,0x74,0x28,0xc2,0xca,0x07]
+; CHECK-NEXT: vcmpneqps %ymm2, %ymm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0xc2,0xca,0x04]
+; CHECK-NEXT: vblendmps (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x float>*
%r = load <8 x float>, <8 x float>* %vaddr, align 32
@@ -243,10 +288,14 @@ define <8 x float> @test_256_25(i8 * %addr, <8 x float> %old, <8 x float> %mask1
ret <8 x float>%res
}
-; CHECK-LABEL: test_256_26
-; CHECK: vmovups{{.*{%k[1-7]} }}
-; CHECK: ret
define <8 x float> @test_256_26(i8 * %addr, <8 x float> %old, <8 x float> %mask1) {
+; CHECK-LABEL: test_256_26:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vcmpordps %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf1,0x74,0x28,0xc2,0xca,0x07]
+; CHECK-NEXT: vcmpneqps %ymm2, %ymm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0xc2,0xca,0x04]
+; CHECK-NEXT: vblendmps (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x float>*
%r = load <8 x float>, <8 x float>* %vaddr, align 1
@@ -254,10 +303,14 @@ define <8 x float> @test_256_26(i8 * %addr, <8 x float> %old, <8 x float> %mask1
ret <8 x float>%res
}
-; CHECK-LABEL: test_256_27
-; CHECK: vmovaps{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <8 x float> @test_256_27(i8 * %addr, <8 x float> %mask1) {
+; CHECK-LABEL: test_256_27:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vcmpordps %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc9,0x07]
+; CHECK-NEXT: vcmpneqps %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0xc2,0xc9,0x04]
+; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x float>*
%r = load <8 x float>, <8 x float>* %vaddr, align 32
@@ -265,10 +318,14 @@ define <8 x float> @test_256_27(i8 * %addr, <8 x float> %mask1) {
ret <8 x float>%res
}
-; CHECK-LABEL: test_256_28
-; CHECK: vmovups{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <8 x float> @test_256_28(i8 * %addr, <8 x float> %mask1) {
+; CHECK-LABEL: test_256_28:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vcmpordps %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc9,0x07]
+; CHECK-NEXT: vcmpneqps %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0xc2,0xc9,0x04]
+; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x float>*
%r = load <8 x float>, <8 x float>* %vaddr, align 1
@@ -276,10 +333,13 @@ define <8 x float> @test_256_28(i8 * %addr, <8 x float> %mask1) {
ret <8 x float>%res
}
-; CHECK-LABEL: test_256_29
-; CHECK: vmovapd{{.*{%k[1-7]} }}
-; CHECK: ret
define <4 x double> @test_256_29(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_29:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vblendmpd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x double>*
%r = load <4 x double>, <4 x double>* %vaddr, align 32
@@ -287,10 +347,13 @@ define <4 x double> @test_256_29(i8 * %addr, <4 x double> %old, <4 x i64> %mask1
ret <4 x double>%res
}
-; CHECK-LABEL: test_256_30
-; CHECK: vmovupd{{.*{%k[1-7]} }}
-; CHECK: ret
define <4 x double> @test_256_30(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_30:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2 ## encoding: [0x62,0xf1,0x6d,0x28,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
+; CHECK-NEXT: vblendmpd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x double>*
%r = load <4 x double>, <4 x double>* %vaddr, align 1
@@ -298,10 +361,13 @@ define <4 x double> @test_256_30(i8 * %addr, <4 x double> %old, <4 x i64> %mask1
ret <4 x double>%res
}
-; CHECK-LABEL: test_256_31
-; CHECK: vmovapd{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <4 x double> @test_256_31(i8 * %addr, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_31:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x double>*
%r = load <4 x double>, <4 x double>* %vaddr, align 32
@@ -309,10 +375,13 @@ define <4 x double> @test_256_31(i8 * %addr, <4 x i64> %mask1) {
ret <4 x double>%res
}
-; CHECK-LABEL: test_256_32
-; CHECK: vmovupd{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <4 x double> @test_256_32(i8 * %addr, <4 x i64> %mask1) {
+; CHECK-LABEL: test_256_32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %ymm1, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x double>*
%r = load <4 x double>, <4 x double>* %vaddr, align 1
@@ -320,154 +389,173 @@ define <4 x double> @test_256_32(i8 * %addr, <4 x i64> %mask1) {
ret <4 x double>%res
}
-; CHECK-LABEL: test_128_1
-; CHECK: vmovdqu32
-; CHECK: ret
define <4 x i32> @test_128_1(i8 * %addr) {
+; CHECK-LABEL: test_128_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i32>*
%res = load <4 x i32>, <4 x i32>* %vaddr, align 1
ret <4 x i32>%res
}
-; CHECK-LABEL: test_128_2
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test_128_2(i8 * %addr) {
+; CHECK-LABEL: test_128_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i32>*
%res = load <4 x i32>, <4 x i32>* %vaddr, align 16
ret <4 x i32>%res
}
-; CHECK-LABEL: test_128_3
-; CHECK: vmovdqa64
-; CHECK: ret
define void @test_128_3(i8 * %addr, <2 x i64> %data) {
+; CHECK-LABEL: test_128_3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 %xmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x08,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x i64>*
store <2 x i64>%data, <2 x i64>* %vaddr, align 16
ret void
}
-; CHECK-LABEL: test_128_4
-; CHECK: vmovdqu32
-; CHECK: ret
define void @test_128_4(i8 * %addr, <4 x i32> %data) {
+; CHECK-LABEL: test_128_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu32 %xmm0, (%rdi) ## encoding: [0x62,0xf1,0x7e,0x08,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i32>*
store <4 x i32>%data, <4 x i32>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_128_5
-; CHECK: vmovdqa32
-; CHECK: ret
define void @test_128_5(i8 * %addr, <4 x i32> %data) {
+; CHECK-LABEL: test_128_5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa32 %xmm0, (%rdi) ## encoding: [0x62,0xf1,0x7d,0x08,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i32>*
store <4 x i32>%data, <4 x i32>* %vaddr, align 16
ret void
}
-; CHECK-LABEL: test_128_6
-; CHECK: vmovdqa64
-; CHECK: ret
define <2 x i64> @test_128_6(i8 * %addr) {
+; CHECK-LABEL: test_128_6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x i64>*
%res = load <2 x i64>, <2 x i64>* %vaddr, align 16
ret <2 x i64>%res
}
-; CHECK-LABEL: test_128_7
-; CHECK: vmovdqu64
-; CHECK: ret
define void @test_128_7(i8 * %addr, <2 x i64> %data) {
+; CHECK-LABEL: test_128_7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu64 %xmm0, (%rdi) ## encoding: [0x62,0xf1,0xfe,0x08,0x7f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x i64>*
store <2 x i64>%data, <2 x i64>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_128_8
-; CHECK: vmovdqu64
-; CHECK: ret
define <2 x i64> @test_128_8(i8 * %addr) {
+; CHECK-LABEL: test_128_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x i64>*
%res = load <2 x i64>, <2 x i64>* %vaddr, align 1
ret <2 x i64>%res
}
-; CHECK-LABEL: test_128_9
-; CHECK: vmovapd {{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_128_9(i8 * %addr, <2 x double> %data) {
+; CHECK-LABEL: test_128_9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovapd %xmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x08,0x29,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x double>*
store <2 x double>%data, <2 x double>* %vaddr, align 16
ret void
}
-; CHECK-LABEL: test_128_10
-; CHECK: vmovapd {{.*}} ## encoding: [0x62
-; CHECK: ret
define <2 x double> @test_128_10(i8 * %addr) {
+; CHECK-LABEL: test_128_10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovapd (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x double>*
%res = load <2 x double>, <2 x double>* %vaddr, align 16
ret <2 x double>%res
}
-; CHECK-LABEL: test_128_11
-; CHECK: vmovaps {{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_128_11(i8 * %addr, <4 x float> %data) {
+; CHECK-LABEL: test_128_11:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps %xmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x08,0x29,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x float>*
store <4 x float>%data, <4 x float>* %vaddr, align 16
ret void
}
-; CHECK-LABEL: test_128_12
-; CHECK: vmovaps {{.*}} ## encoding: [0x62
-; CHECK: ret
define <4 x float> @test_128_12(i8 * %addr) {
+; CHECK-LABEL: test_128_12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x float>*
%res = load <4 x float>, <4 x float>* %vaddr, align 16
ret <4 x float>%res
}
-; CHECK-LABEL: test_128_13
-; CHECK: vmovupd {{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_128_13(i8 * %addr, <2 x double> %data) {
+; CHECK-LABEL: test_128_13:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd %xmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x08,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x double>*
store <2 x double>%data, <2 x double>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_128_14
-; CHECK: vmovupd {{.*}} ## encoding: [0x62
-; CHECK: ret
define <2 x double> @test_128_14(i8 * %addr) {
+; CHECK-LABEL: test_128_14:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovupd (%rdi), %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x double>*
%res = load <2 x double>, <2 x double>* %vaddr, align 1
ret <2 x double>%res
}
-; CHECK-LABEL: test_128_15
-; CHECK: vmovups {{.*}} ## encoding: [0x62
-; CHECK: ret
define void @test_128_15(i8 * %addr, <4 x float> %data) {
+; CHECK-LABEL: test_128_15:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups %xmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x08,0x11,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x float>*
store <4 x float>%data, <4 x float>* %vaddr, align 1
ret void
}
-; CHECK-LABEL: test_128_16
-; CHECK: vmovups {{.*}} ## encoding: [0x62
-; CHECK: ret
define <4 x float> @test_128_16(i8 * %addr) {
+; CHECK-LABEL: test_128_16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovups (%rdi), %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x float>*
%res = load <4 x float>, <4 x float>* %vaddr, align 1
ret <4 x float>%res
}
-; CHECK-LABEL: test_128_17
-; CHECK: vmovdqa32{{.*{%k[1-7]} }}
-; CHECK: ret
define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_17:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i32>*
%r = load <4 x i32>, <4 x i32>* %vaddr, align 16
@@ -475,10 +563,13 @@ define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
ret <4 x i32>%res
}
-; CHECK-LABEL: test_128_18
-; CHECK: vmovdqu32{{.*{%k[1-7]} }}
-; CHECK: ret
define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_18:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i32>*
%r = load <4 x i32>, <4 x i32>* %vaddr, align 1
@@ -486,10 +577,13 @@ define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
ret <4 x i32>%res
}
-; CHECK-LABEL: test_128_19
-; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <4 x i32> @test_128_19(i8 * %addr, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_19:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i32>*
%r = load <4 x i32>, <4 x i32>* %vaddr, align 16
@@ -497,10 +591,13 @@ define <4 x i32> @test_128_19(i8 * %addr, <4 x i32> %mask1) {
ret <4 x i32>%res
}
-; CHECK-LABEL: test_128_20
-; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <4 x i32> @test_128_20(i8 * %addr, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_20:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i32>*
%r = load <4 x i32>, <4 x i32>* %vaddr, align 1
@@ -508,10 +605,13 @@ define <4 x i32> @test_128_20(i8 * %addr, <4 x i32> %mask1) {
ret <4 x i32>%res
}
-; CHECK-LABEL: test_128_21
-; CHECK: vmovdqa64{{.*{%k[1-7]} }}
-; CHECK: ret
define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_21:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmq (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x i64>*
%r = load <2 x i64>, <2 x i64>* %vaddr, align 16
@@ -519,10 +619,13 @@ define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
ret <2 x i64>%res
}
-; CHECK-LABEL: test_128_22
-; CHECK: vmovdqu64{{.*{%k[1-7]} }}
-; CHECK: ret
define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_22:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vpblendmq (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x64,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x i64>*
%r = load <2 x i64>, <2 x i64>* %vaddr, align 1
@@ -530,10 +633,13 @@ define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
ret <2 x i64>%res
}
-; CHECK-LABEL: test_128_23
-; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <2 x i64> @test_128_23(i8 * %addr, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_23:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x i64>*
%r = load <2 x i64>, <2 x i64>* %vaddr, align 16
@@ -541,10 +647,13 @@ define <2 x i64> @test_128_23(i8 * %addr, <2 x i64> %mask1) {
ret <2 x i64>%res
}
-; CHECK-LABEL: test_128_24
-; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <2 x i64> @test_128_24(i8 * %addr, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_24:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0x89,0x6f,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x i64>*
%r = load <2 x i64>, <2 x i64>* %vaddr, align 1
@@ -552,10 +661,13 @@ define <2 x i64> @test_128_24(i8 * %addr, <2 x i64> %mask1) {
ret <2 x i64>%res
}
-; CHECK-LABEL: test_128_25
-; CHECK: vmovaps{{.*{%k[1-7]} }}
-; CHECK: ret
define <4 x float> @test_128_25(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_25:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vblendmps (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x float>*
%r = load <4 x float>, <4 x float>* %vaddr, align 16
@@ -563,10 +675,13 @@ define <4 x float> @test_128_25(i8 * %addr, <4 x float> %old, <4 x i32> %mask1)
ret <4 x float>%res
}
-; CHECK-LABEL: test_128_26
-; CHECK: vmovups{{.*{%k[1-7]} }}
-; CHECK: ret
define <4 x float> @test_128_26(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_26:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vblendmps (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x float>*
%r = load <4 x float>, <4 x float>* %vaddr, align 1
@@ -574,10 +689,13 @@ define <4 x float> @test_128_26(i8 * %addr, <4 x float> %old, <4 x i32> %mask1)
ret <4 x float>%res
}
-; CHECK-LABEL: test_128_27
-; CHECK: vmovaps{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <4 x float> @test_128_27(i8 * %addr, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_27:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x float>*
%r = load <4 x float>, <4 x float>* %vaddr, align 16
@@ -585,10 +703,13 @@ define <4 x float> @test_128_27(i8 * %addr, <4 x i32> %mask1) {
ret <4 x float>%res
}
-; CHECK-LABEL: test_128_28
-; CHECK: vmovups{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <4 x float> @test_128_28(i8 * %addr, <4 x i32> %mask1) {
+; CHECK-LABEL: test_128_28:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x float>*
%r = load <4 x float>, <4 x float>* %vaddr, align 1
@@ -596,10 +717,13 @@ define <4 x float> @test_128_28(i8 * %addr, <4 x i32> %mask1) {
ret <4 x float>%res
}
-; CHECK-LABEL: test_128_29
-; CHECK: vmovapd{{.*{%k[1-7]} }}
-; CHECK: ret
define <2 x double> @test_128_29(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_29:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vblendmpd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x double>*
%r = load <2 x double>, <2 x double>* %vaddr, align 16
@@ -607,10 +731,13 @@ define <2 x double> @test_128_29(i8 * %addr, <2 x double> %old, <2 x i64> %mask1
ret <2 x double>%res
}
-; CHECK-LABEL: test_128_30
-; CHECK: vmovupd{{.*{%k[1-7]} }}
-; CHECK: ret
define <2 x double> @test_128_30(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_30:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2 ## encoding: [0x62,0xf1,0x6d,0x08,0xef,0xd2]
+; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
+; CHECK-NEXT: vblendmpd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x65,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x double>*
%r = load <2 x double>, <2 x double>* %vaddr, align 1
@@ -618,10 +745,13 @@ define <2 x double> @test_128_30(i8 * %addr, <2 x double> %old, <2 x i64> %mask1
ret <2 x double>%res
}
-; CHECK-LABEL: test_128_31
-; CHECK: vmovapd{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <2 x double> @test_128_31(i8 * %addr, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_31:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x28,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x double>*
%r = load <2 x double>, <2 x double>* %vaddr, align 16
@@ -629,10 +759,13 @@ define <2 x double> @test_128_31(i8 * %addr, <2 x i64> %mask1) {
ret <2 x double>%res
}
-; CHECK-LABEL: test_128_32
-; CHECK: vmovupd{{.*{%k[1-7]} {z} }}
-; CHECK: ret
define <2 x double> @test_128_32(i8 * %addr, <2 x i64> %mask1) {
+; CHECK-LABEL: test_128_32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxord %xmm1, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x75,0x08,0xef,0xc9]
+; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
+; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x10,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x double>*
%r = load <2 x double>, <2 x double>* %vaddr, align 1
diff --git a/test/CodeGen/X86/avx512vl-vbroadcast.ll b/test/CodeGen/X86/avx512vl-vbroadcast.ll
new file mode 100644
index 000000000000..dec6239fafc6
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-vbroadcast.ll
@@ -0,0 +1,175 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -mattr=+avx512vl| FileCheck %s
+
+declare void @func_f32(float)
+define <8 x float> @_256_broadcast_ss_spill(float %x) {
+; CHECK-LABEL: _256_broadcast_ss_spill:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: vaddss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT: callq func_f32
+; CHECK-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %ymm0 # 4-byte Folded Reload
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %a = fadd float %x, %x
+ call void @func_f32(float %a)
+ %b = insertelement <8 x float> undef, float %a, i32 0
+ %c = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %c
+}
+
+define <4 x float> @_128_broadcast_ss_spill(float %x) {
+; CHECK-LABEL: _128_broadcast_ss_spill:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .Ltmp1:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: vaddss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT: callq func_f32
+; CHECK-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %a = fadd float %x, %x
+ call void @func_f32(float %a)
+ %b = insertelement <4 x float> undef, float %a, i32 0
+ %c = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %c
+}
+
+declare void @func_f64(double)
+define <4 x double> @_256_broadcast_sd_spill(double %x) {
+; CHECK-LABEL: _256_broadcast_sd_spill:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .Ltmp2:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: vaddsd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT: callq func_f64
+; CHECK-NEXT: vbroadcastsd (%rsp), %ymm0 # 8-byte Folded Reload
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %a = fadd double %x, %x
+ call void @func_f64(double %a)
+ %b = insertelement <4 x double> undef, double %a, i32 0
+ %c = shufflevector <4 x double> %b, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %c
+}
+
+define <8 x float> @_inreg8xfloat(float %a) {
+; CHECK-LABEL: _inreg8xfloat:
+; CHECK: # BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %b = insertelement <8 x float> undef, float %a, i32 0
+ %c = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %c
+}
+
+define <8 x float> @_ss8xfloat_mask(<8 x float> %i, float %a, <8 x i32> %mask1) {
+; CHECK-LABEL: _ss8xfloat_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxord %ymm3, %ymm3, %ymm3
+; CHECK-NEXT: vpcmpneqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+ %b = insertelement <8 x float> undef, float %a, i32 0
+ %c = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> zeroinitializer
+ %r = select <8 x i1> %mask, <8 x float> %c, <8 x float> %i
+ ret <8 x float> %r
+}
+
+define <8 x float> @_ss8xfloat_maskz(float %a, <8 x i32> %mask1) {
+; CHECK-LABEL: _ss8xfloat_maskz:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxord %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+ %b = insertelement <8 x float> undef, float %a, i32 0
+ %c = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> zeroinitializer
+ %r = select <8 x i1> %mask, <8 x float> %c, <8 x float> zeroinitializer
+ ret <8 x float> %r
+}
+
+define <4 x float> @_inreg4xfloat(float %a) {
+; CHECK-LABEL: _inreg4xfloat:
+; CHECK: # BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %b = insertelement <4 x float> undef, float %a, i32 0
+ %c = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %c
+}
+
+define <4 x float> @_ss4xfloat_mask(<4 x float> %i, float %a, <4 x i32> %mask1) {
+; CHECK-LABEL: _ss4xfloat_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxord %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpneqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %b = insertelement <4 x float> undef, float %a, i32 0
+ %c = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+ %r = select <4 x i1> %mask, <4 x float> %c, <4 x float> %i
+ ret <4 x float> %r
+}
+
+define <4 x float> @_ss4xfloat_maskz(float %a, <4 x i32> %mask1) {
+; CHECK-LABEL: _ss4xfloat_maskz:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %b = insertelement <4 x float> undef, float %a, i32 0
+ %c = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+ %r = select <4 x i1> %mask, <4 x float> %c, <4 x float> zeroinitializer
+ ret <4 x float> %r
+}
+
+define <4 x double> @_inreg4xdouble(double %a) {
+; CHECK-LABEL: _inreg4xdouble:
+; CHECK: # BB#0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %b = insertelement <4 x double> undef, double %a, i32 0
+ %c = shufflevector <4 x double> %b, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %c
+}
+
+define <4 x double> @_ss4xdouble_mask(<4 x double> %i, double %a, <4 x i32> %mask1) {
+; CHECK-LABEL: _ss4xdouble_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxord %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpneqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %b = insertelement <4 x double> undef, double %a, i32 0
+ %c = shufflevector <4 x double> %b, <4 x double> undef, <4 x i32> zeroinitializer
+ %r = select <4 x i1> %mask, <4 x double> %c, <4 x double> %i
+ ret <4 x double> %r
+}
+
+define <4 x double> @_ss4xdouble_maskz(double %a, <4 x i32> %mask1) {
+; CHECK-LABEL: _ss4xdouble_maskz:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+ %b = insertelement <4 x double> undef, double %a, i32 0
+ %c = shufflevector <4 x double> %b, <4 x double> undef, <4 x i32> zeroinitializer
+ %r = select <4 x i1> %mask, <4 x double> %c, <4 x double> zeroinitializer
+ ret <4 x double> %r
+}
diff --git a/test/CodeGen/X86/avx512vl-vec-cmp.ll b/test/CodeGen/X86/avx512vl-vec-cmp.ll
index aed8cb1cf559..62c8a26d1e60 100644
--- a/test/CodeGen/X86/avx512vl-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-cmp.ll
@@ -1,94 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
-; CHECK-LABEL: test256_1
-; CHECK: vpcmpeqq {{.*%k[0-7]}}
-; CHECK: vmovdqa64 {{.*}}%k1
-; CHECK: ret
define <4 x i64> @test256_1(<4 x i64> %x, <4 x i64> %y) nounwind {
+; CHECK-LABEL: test256_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp eq <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
ret <4 x i64> %max
}
-; CHECK-LABEL: test256_2
-; CHECK: vpcmpgtq {{.*%k[0-7]}}
-; CHECK: vmovdqa64 {{.*}}%k1
-; CHECK: ret
define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind {
+; CHECK-LABEL: test256_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sgt <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y
ret <4 x i64> %max
}
-; CHECK-LABEL: @test256_3
-; CHECK: vpcmpled {{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test256_3(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1) nounwind {
+; CHECK-LABEL: test256_3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k1
+; CHECK-NEXT: vpblendmd %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sge <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x1, <8 x i32> %y
ret <8 x i32> %max
}
-; CHECK-LABEL: test256_4
-; CHECK: vpcmpnleuq {{.*%k[0-7]}}
-; CHECK: vmovdqa64 {{.*}}%k1
-; CHECK: ret
define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind {
+; CHECK-LABEL: test256_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp ugt <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y
ret <4 x i64> %max
}
-; CHECK-LABEL: test256_5
-; CHECK: vpcmpeqd (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
+; CHECK-LABEL: test256_5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp eq <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
ret <8 x i32> %max
}
-; CHECK-LABEL: @test256_6
-; CHECK: vpcmpgtd (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test256_6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sgt <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
ret <8 x i32> %max
}
-; CHECK-LABEL: @test256_7
-; CHECK: vpcmpled (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test256_7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sle <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
ret <8 x i32> %max
}
-; CHECK-LABEL: @test256_8
-; CHECK: vpcmpleud (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test256_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp ule <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
ret <8 x i32> %max
}
-; CHECK-LABEL: @test256_9
-; CHECK: vpcmpeqd %ymm{{.*{%k[1-7]}}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32> %y1) nounwind {
+; CHECK-LABEL: test256_9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 {%k1}
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp eq <8 x i32> %x1, %y1
%mask0 = icmp eq <8 x i32> %x, %y
%mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
@@ -96,11 +107,13 @@ define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32>
ret <8 x i32> %max
}
-; CHECK-LABEL: @test256_10
-; CHECK: vpcmpleq %ymm{{.*{%k[1-7]}}}
-; CHECK: vmovdqa64
-; CHECK: ret
define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) nounwind {
+; CHECK-LABEL: test256_10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpcmpleq %ymm2, %ymm3, %k1 {%k1}
+; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i64> %x1, %y1
%mask0 = icmp sle <4 x i64> %x, %y
%mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
@@ -108,11 +121,13 @@ define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64
ret <4 x i64> %max
}
-; CHECK-LABEL: @test256_11
-; CHECK: vpcmpgtq (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqa64
-; CHECK: ret
define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind {
+; CHECK-LABEL: test256_11:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sgt <4 x i64> %x1, %y1
%y = load <4 x i64>, <4 x i64>* %y.ptr, align 4
%mask0 = icmp sgt <4 x i64> %x, %y
@@ -121,11 +136,13 @@ define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4
ret <4 x i64> %max
}
-; CHECK-LABEL: @test256_12
-; CHECK: vpcmpleud (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind {
+; CHECK-LABEL: test256_12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1
+; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i32> %x1, %y1
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask0 = icmp ule <8 x i32> %x, %y
@@ -134,11 +151,12 @@ define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8
ret <8 x i32> %max
}
-; CHECK-LABEL: test256_13
-; CHECK: vpcmpeqq (%rdi){1to4}, %ymm
-; CHECK: vmovdqa64
-; CHECK: ret
define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind {
+; CHECK-LABEL: test256_13:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k1
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0
%y = shufflevector <4 x i64> %y.0, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -147,11 +165,12 @@ define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind
ret <4 x i64> %max
}
-; CHECK-LABEL: test256_14
-; CHECK: vpcmpled (%rdi){1to8}, %ymm
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind {
+; CHECK-LABEL: test256_14:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled (%rdi){1to8}, %ymm0, %k1
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0
%y = shufflevector <8 x i32> %y.0, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -160,11 +179,13 @@ define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind
ret <8 x i32> %max
}
-; CHECK-LABEL: test256_15
-; CHECK: vpcmpgtd (%rdi){1to8}, %ymm{{.*{%k[1-7]}}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind {
+; CHECK-LABEL: test256_15:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1
+; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0
@@ -175,11 +196,13 @@ define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32
ret <8 x i32> %max
}
-; CHECK-LABEL: test256_16
-; CHECK: vpcmpgtq (%rdi){1to4}, %ymm{{.*{%k[1-7]}}}
-; CHECK: vmovdqa64
-; CHECK: ret
define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind {
+; CHECK-LABEL: test256_16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleq %ymm1, %ymm2, %k1
+; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0
@@ -190,95 +213,105 @@ define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64
ret <4 x i64> %max
}
-; CHECK-LABEL: test128_1
-; CHECK: vpcmpeqq {{.*%k[0-7]}}
-; CHECK: vmovdqa64 {{.*}}%k1
-; CHECK: ret
define <2 x i64> @test128_1(<2 x i64> %x, <2 x i64> %y) nounwind {
+; CHECK-LABEL: test128_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp eq <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y
ret <2 x i64> %max
}
-; CHECK-LABEL: test128_2
-; CHECK: vpcmpgtq {{.*%k[0-7]}}
-; CHECK: vmovdqa64 {{.*}}%k1
-; CHECK: ret
define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind {
+; CHECK-LABEL: test128_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sgt <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y
ret <2 x i64> %max
}
-; CHECK-LABEL: @test128_3
-; CHECK: vpcmpled {{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test128_3(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1) nounwind {
+; CHECK-LABEL: test128_3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k1
+; CHECK-NEXT: vpblendmd %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp sge <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x1, <4 x i32> %y
ret <4 x i32> %max
}
-; CHECK-LABEL: test128_4
-; CHECK: vpcmpnleuq {{.*%k[0-7]}}
-; CHECK: vmovdqa64 {{.*}}%k1
-; CHECK: ret
define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind {
+; CHECK-LABEL: test128_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask = icmp ugt <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y
ret <2 x i64> %max
}
-; CHECK-LABEL: test128_5
-; CHECK: vpcmpeqd (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwind {
+; CHECK-LABEL: test128_5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %yp, align 4
%mask = icmp eq <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
ret <4 x i32> %max
}
-; CHECK-LABEL: @test128_6
-; CHECK: vpcmpgtd (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test128_6:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp sgt <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
ret <4 x i32> %max
}
-; CHECK-LABEL: @test128_7
-; CHECK: vpcmpled (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test128_7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp sle <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
ret <4 x i32> %max
}
-; CHECK-LABEL: @test128_8
-; CHECK: vpcmpleud (%rdi){{.*%k[0-7]}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test128_8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp ule <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
ret <4 x i32> %max
}
-; CHECK-LABEL: @test128_9
-; CHECK: vpcmpeqd %xmm{{.*{%k[1-7]}}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32> %y1) nounwind {
+; CHECK-LABEL: test128_9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 {%k1}
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp eq <4 x i32> %x1, %y1
%mask0 = icmp eq <4 x i32> %x, %y
%mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
@@ -286,11 +319,13 @@ define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32>
ret <4 x i32> %max
}
-; CHECK-LABEL: @test128_10
-; CHECK: vpcmpleq %xmm{{.*{%k[1-7]}}}
-; CHECK: vmovdqa64
-; CHECK: ret
define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) nounwind {
+; CHECK-LABEL: test128_10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpcmpleq %xmm2, %xmm3, %k1 {%k1}
+; CHECK-NEXT: vpblendmq %xmm0, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <2 x i64> %x1, %y1
%mask0 = icmp sle <2 x i64> %x, %y
%mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer
@@ -298,11 +333,13 @@ define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64
ret <2 x i64> %max
}
-; CHECK-LABEL: @test128_11
-; CHECK: vpcmpgtq (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqa64
-; CHECK: ret
define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind {
+; CHECK-LABEL: test128_11:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtq %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sgt <2 x i64> %x1, %y1
%y = load <2 x i64>, <2 x i64>* %y.ptr, align 4
%mask0 = icmp sgt <2 x i64> %x, %y
@@ -311,11 +348,13 @@ define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2
ret <2 x i64> %max
}
-; CHECK-LABEL: @test128_12
-; CHECK: vpcmpleud (%rdi){{.*{%k[1-7]}}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind {
+; CHECK-LABEL: test128_12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1
+; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i32> %x1, %y1
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask0 = icmp ule <4 x i32> %x, %y
@@ -324,11 +363,12 @@ define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4
ret <4 x i32> %max
}
-; CHECK-LABEL: test128_13
-; CHECK: vpcmpeqq (%rdi){1to2}, %xmm
-; CHECK: vmovdqa64
-; CHECK: ret
define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind {
+; CHECK-LABEL: test128_13:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k1
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0
%y = insertelement <2 x i64> %y.0, i64 %yb, i32 1
@@ -337,11 +377,12 @@ define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind
ret <2 x i64> %max
}
-; CHECK-LABEL: test128_14
-; CHECK: vpcmpled (%rdi){1to4}, %xmm
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind {
+; CHECK-LABEL: test128_14:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled (%rdi){1to4}, %xmm0, %k1
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0
%y = shufflevector <4 x i32> %y.0, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -350,11 +391,13 @@ define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind
ret <4 x i32> %max
}
-; CHECK-LABEL: test128_15
-; CHECK: vpcmpgtd (%rdi){1to4}, %xmm{{.*{%k[1-7]}}}
-; CHECK: vmovdqa32
-; CHECK: ret
define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind {
+; CHECK-LABEL: test128_15:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1
+; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0
@@ -365,11 +408,13 @@ define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32
ret <4 x i32> %max
}
-; CHECK-LABEL: test128_16
-; CHECK: vpcmpgtq (%rdi){1to2}, %xmm{{.*{%k[1-7]}}}
-; CHECK: vmovdqa64
-; CHECK: ret
define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind {
+; CHECK-LABEL: test128_16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpleq %xmm1, %xmm2, %k1
+; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k1 {%k1}
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
%mask1 = icmp sge <2 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0
diff --git a/test/CodeGen/X86/base-pointer-and-cmpxchg.ll b/test/CodeGen/X86/base-pointer-and-cmpxchg.ll
new file mode 100644
index 000000000000..8de6d64428e3
--- /dev/null
+++ b/test/CodeGen/X86/base-pointer-and-cmpxchg.ll
@@ -0,0 +1,51 @@
+; RUN: llc -mtriple=x86_64-apple-macosx -mattr=+cx16 -x86-use-base-pointer=true -stackrealign -stack-alignment=32 %s -o - | FileCheck --check-prefix=CHECK --check-prefix=USE_BASE --check-prefix=USE_BASE_64 %s
+; RUN: llc -mtriple=x86_64-apple-macosx -mattr=+cx16 -x86-use-base-pointer=false -stackrealign -stack-alignment=32 %s -o - | FileCheck --check-prefix=CHECK --check-prefix=DONT_USE_BASE %s
+; RUN: llc -mtriple=x86_64-linux-gnux32 -mattr=+cx16 -x86-use-base-pointer=true -stackrealign -stack-alignment=32 %s -o - | FileCheck --check-prefix=CHECK --check-prefix=USE_BASE --check-prefix=USE_BASE_32 %s
+; RUN: llc -mtriple=x86_64-linux-gnux32 -mattr=+cx16 -x86-use-base-pointer=false -stackrealign -stack-alignment=32 %s -o - | FileCheck --check-prefix=CHECK --check-prefix=DONT_USE_BASE %s
+
+; This function uses dynamically allocated stack space to force the use
+; of a frame pointer.
+; The inline asm clobbers a bunch of registers to make sure
+; the frame pointer will need to be used (for spilling in that case).
+;
+; Then, we check that when rbx is used as the base pointer, cmpxchg cannot
+; simply clobber it: the instruction requires clobbering rbx to set up its
+; arguments, and when rbx is the base pointer the register allocator cannot
+; fix the code for us, so rbx has to be saved and restored around it.
+;
+; CHECK-LABEL: cmp_and_swap16:
+; Check that we actually use rbx.
+; gnux32 uses the 32-bit variants of the registers.
+; USE_BASE_64: movq %rsp, %rbx
+; USE_BASE_32: movl %esp, %ebx
+;
+; Make sure the base pointer is saved before the RBX argument for
+; cmpxchg16b is set.
+;
+; Because of how the test is written, we spill SAVE_RBX.
+; However, it would have been perfectly fine to just keep it in a register.
+; USE_BASE: movq %rbx, [[SAVE_RBX_SLOT:[0-9]*\(%[er]bx\)]]
+;
+; SAVE_RBX must be in a register before we clobber rbx.
+; It is fine to use any register except rbx and the ones defined and used
+; by cmpxchg. Since such a regex would be complicated to write, just stick
+; to the numbered registers. The bottom line is: if this test case fails
+; because of that regex, it is likely just the regex being too conservative.
+; USE_BASE: movq [[SAVE_RBX_SLOT]], [[SAVE_RBX:%r[0-9]+]]
+;
+; USE_BASE: movq {{[^ ]+}}, %rbx
+; USE_BASE-NEXT: cmpxchg16b
+; USE_BASE-NEXT: movq [[SAVE_RBX]], %rbx
+;
+; DONT_USE_BASE-NOT: movq %rsp, %rbx
+; DONT_USE_BASE-NOT: movl %esp, %ebx
+; DONT_USE_BASE: cmpxchg
+define i1 @cmp_and_swap16(i128 %a, i128 %b, i128* %addr, i32 %n) {
+ %dummy = alloca i32, i32 %n
+tail call void asm sideeffect "nop", "~{rax},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ %cmp = cmpxchg i128* %addr, i128 %a, i128 %b seq_cst seq_cst
+ %res = extractvalue { i128, i1 } %cmp, 1
+ %idx = getelementptr i32, i32* %dummy, i32 5
+ store i32 %n, i32* %idx
+ ret i1 %res
+}
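+
+; For reference, the shape the USE_BASE checks describe is roughly the
+; following (a hand-written sketch, not generated output; the exact scratch
+; register, stack offsets, and address operand depend on register allocation):
+;
+;   movq %rsp, %rbx            # establish rbx as the base pointer
+;   movq %rbx, 16(%rbx)        # spill the base pointer (SAVE_RBX)
+;   ...
+;   movq 16(%rbx), %r8         # reload SAVE_RBX into a scratch register
+;   movq <new-low-qword>, %rbx # set the RBX argument of cmpxchg16b
+;   cmpxchg16b (<addr reg>)
+;   movq %r8, %rbx             # restore the base pointer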
diff --git a/test/CodeGen/X86/bit-piece-comment.ll b/test/CodeGen/X86/bit-piece-comment.ll
index 6ce858b11dcf..9ebe5bc6d5af 100644
--- a/test/CodeGen/X86/bit-piece-comment.ll
+++ b/test/CodeGen/X86/bit-piece-comment.ll
@@ -39,11 +39,10 @@ attributes #1 = { nounwind readnone }
!llvm.module.flags = !{!16, !17}
!llvm.ident = !{!18}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 256088) (llvm/trunk 256097)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 256088) (llvm/trunk 256097)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "test.cpp", directory: "/mnt/extra")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "fn1", linkageName: "_Z3fn1v", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, variables: !7)
+!4 = distinct !DISubprogram(name: "fn1", linkageName: "_Z3fn1v", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !7)
!5 = !DISubroutineType(types: !6)
!6 = !{null}
!7 = !{!8}
diff --git a/test/CodeGen/X86/bitreverse.ll b/test/CodeGen/X86/bitreverse.ll
index e3bc8ace38ab..f1b325a03ebd 100644
--- a/test/CodeGen/X86/bitreverse.ll
+++ b/test/CodeGen/X86/bitreverse.ll
@@ -1,22 +1,390 @@
-; RUN: llc -march=x86 %s -o - | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s
; These tests just check that the plumbing is in place for @llvm.bitreverse. The
; actual output is massive at the moment as llvm.bitreverse is not yet legal.
declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>) readnone
-define <2 x i16> @f(<2 x i16> %a) {
-; CHECK-LABEL: f:
-; CHECK: shll
+define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
+; CHECK-LABEL: test_bitreverse_v2i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shll $15, %ecx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: andl $2, %esi
+; CHECK-NEXT: shll $13, %esi
+; CHECK-NEXT: orl %ecx, %esi
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: andl $4, %ecx
+; CHECK-NEXT: shll $11, %ecx
+; CHECK-NEXT: orl %esi, %ecx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: andl $8, %esi
+; CHECK-NEXT: shll $9, %esi
+; CHECK-NEXT: orl %ecx, %esi
+; CHECK-NEXT: movl %eax, %edi
+; CHECK-NEXT: andl $16, %edi
+; CHECK-NEXT: shll $7, %edi
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: andl $32, %ecx
+; CHECK-NEXT: shll $5, %ecx
+; CHECK-NEXT: orl %edi, %ecx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: andl $64, %esi
+; CHECK-NEXT: shll $3, %esi
+; CHECK-NEXT: leal (%eax,%eax), %edi
+; CHECK-NEXT: andl $256, %edi # imm = 0x100
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shrl %esi
+; CHECK-NEXT: andl $128, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: movl %eax, %edi
+; CHECK-NEXT: shrl $3, %edi
+; CHECK-NEXT: andl $64, %edi
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shrl $5, %esi
+; CHECK-NEXT: andl $32, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: movl %eax, %edi
+; CHECK-NEXT: shrl $7, %edi
+; CHECK-NEXT: andl $16, %edi
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shrl $9, %esi
+; CHECK-NEXT: andl $8, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: movl %eax, %edi
+; CHECK-NEXT: shrl $11, %edi
+; CHECK-NEXT: andl $4, %edi
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shrl $13, %esi
+; CHECK-NEXT: andl $2, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: shrl $15, %eax
+; CHECK-NEXT: orl %esi, %eax
+; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: shll $15, %ecx
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: andl $2, %esi
+; CHECK-NEXT: shll $13, %esi
+; CHECK-NEXT: orl %ecx, %esi
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: andl $4, %ecx
+; CHECK-NEXT: shll $11, %ecx
+; CHECK-NEXT: orl %esi, %ecx
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: andl $8, %esi
+; CHECK-NEXT: shll $9, %esi
+; CHECK-NEXT: orl %ecx, %esi
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: andl $16, %edi
+; CHECK-NEXT: shll $7, %edi
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: andl $32, %ecx
+; CHECK-NEXT: shll $5, %ecx
+; CHECK-NEXT: orl %edi, %ecx
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: andl $64, %esi
+; CHECK-NEXT: shll $3, %esi
+; CHECK-NEXT: leal (%edx,%edx), %edi
+; CHECK-NEXT: andl $256, %edi # imm = 0x100
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: shrl %esi
+; CHECK-NEXT: andl $128, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: shrl $3, %edi
+; CHECK-NEXT: andl $64, %edi
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: shrl $5, %esi
+; CHECK-NEXT: andl $32, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: shrl $7, %edi
+; CHECK-NEXT: andl $16, %edi
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: shrl $9, %esi
+; CHECK-NEXT: andl $8, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: shrl $11, %edi
+; CHECK-NEXT: andl $4, %edi
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: shrl $13, %esi
+; CHECK-NEXT: andl $2, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: shrl $15, %edx
+; CHECK-NEXT: orl %esi, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: # kill: %DX<def> %DX<kill> %EDX<kill>
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: retl
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
ret <2 x i16> %b
}
+declare i24 @llvm.bitreverse.i24(i24) readnone
+
+define i24 @test_bitreverse_i24(i24 %a) nounwind {
+; CHECK-LABEL: test_bitreverse_i24:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shll $31, %ecx
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: andl $2, %edx
+; CHECK-NEXT: shll $29, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: andl $4, %ecx
+; CHECK-NEXT: shll $27, %ecx
+; CHECK-NEXT: orl %edx, %ecx
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: andl $8, %edx
+; CHECK-NEXT: shll $25, %edx
+; CHECK-NEXT: orl %ecx, %edx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: andl $16, %esi
+; CHECK-NEXT: shll $23, %esi
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: andl $32, %ecx
+; CHECK-NEXT: shll $21, %ecx
+; CHECK-NEXT: orl %esi, %ecx
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: andl $64, %edx
+; CHECK-NEXT: shll $19, %edx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shll $17, %esi
+; CHECK-NEXT: andl $16777216, %esi # imm = 0x1000000
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shll $15, %edx
+; CHECK-NEXT: andl $8388608, %edx # imm = 0x800000
+; CHECK-NEXT: orl %esi, %edx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shll $13, %esi
+; CHECK-NEXT: andl $4194304, %esi # imm = 0x400000
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shll $11, %edx
+; CHECK-NEXT: andl $2097152, %edx # imm = 0x200000
+; CHECK-NEXT: orl %esi, %edx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shll $9, %esi
+; CHECK-NEXT: andl $1048576, %esi # imm = 0x100000
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shll $7, %edx
+; CHECK-NEXT: andl $524288, %edx # imm = 0x80000
+; CHECK-NEXT: orl %esi, %edx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shll $5, %esi
+; CHECK-NEXT: andl $262144, %esi # imm = 0x40000
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: leal (,%eax,8), %edx
+; CHECK-NEXT: andl $131072, %edx # imm = 0x20000
+; CHECK-NEXT: orl %esi, %edx
+; CHECK-NEXT: leal (%eax,%eax), %esi
+; CHECK-NEXT: andl $65536, %esi # imm = 0x10000
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shrl %edx
+; CHECK-NEXT: andl $32768, %edx # imm = 0x8000
+; CHECK-NEXT: orl %esi, %edx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shrl $3, %esi
+; CHECK-NEXT: andl $16384, %esi # imm = 0x4000
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shrl $5, %edx
+; CHECK-NEXT: andl $8192, %edx # imm = 0x2000
+; CHECK-NEXT: orl %esi, %edx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shrl $7, %esi
+; CHECK-NEXT: andl $4096, %esi # imm = 0x1000
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shrl $9, %edx
+; CHECK-NEXT: andl $2048, %edx # imm = 0x800
+; CHECK-NEXT: orl %esi, %edx
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: shrl $11, %esi
+; CHECK-NEXT: andl $1024, %esi # imm = 0x400
+; CHECK-NEXT: orl %edx, %esi
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shrl $13, %edx
+; CHECK-NEXT: andl $512, %edx # imm = 0x200
+; CHECK-NEXT: orl %esi, %edx
+; CHECK-NEXT: shrl $15, %eax
+; CHECK-NEXT: andl $256, %eax # imm = 0x100
+; CHECK-NEXT: orl %edx, %eax
+; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: shrl $8, %eax
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: retl
+ %b = call i24 @llvm.bitreverse.i24(i24 %a)
+ ret i24 %b
+}
+
declare i8 @llvm.bitreverse.i8(i8) readnone
-define i8 @g(i8 %a) {
-; CHECK-LABEL: g:
-; CHECK: shlb
+define i8 @test_bitreverse_i8(i8 %a) {
+; CHECK-LABEL: test_bitreverse_i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shlb $7, %cl
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shlb $5, %dl
+; CHECK-NEXT: andb $64, %dl
+; CHECK-NEXT: movb %al, %ah
+; CHECK-NEXT: shlb $3, %ah
+; CHECK-NEXT: andb $32, %ah
+; CHECK-NEXT: orb %dl, %ah
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: addb %dl, %dl
+; CHECK-NEXT: andb $16, %dl
+; CHECK-NEXT: orb %ah, %dl
+; CHECK-NEXT: movb %al, %ah
+; CHECK-NEXT: shrb %ah
+; CHECK-NEXT: andb $8, %ah
+; CHECK-NEXT: orb %dl, %ah
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shrb $3, %dl
+; CHECK-NEXT: andb $4, %dl
+; CHECK-NEXT: orb %ah, %dl
+; CHECK-NEXT: movb %al, %ah
+; CHECK-NEXT: shrb $5, %ah
+; CHECK-NEXT: andb $2, %ah
+; CHECK-NEXT: orb %dl, %ah
+; CHECK-NEXT: shrb $7, %al
+; CHECK-NEXT: orb %ah, %al
+; CHECK-NEXT: orb %cl, %al
+; CHECK-NEXT: retl
%b = call i8 @llvm.bitreverse.i8(i8 %a)
ret i8 %b
}
+
+declare i4 @llvm.bitreverse.i4(i4) readnone
+
+define i4 @test_bitreverse_i4(i4 %a) {
+; CHECK-LABEL: test_bitreverse_i4:
+; CHECK: # BB#0:
+; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shlb $7, %cl
+; CHECK-NEXT: movl %eax, %edx
+; CHECK-NEXT: shlb $5, %dl
+; CHECK-NEXT: andb $64, %dl
+; CHECK-NEXT: movb %al, %ah
+; CHECK-NEXT: shlb $3, %ah
+; CHECK-NEXT: andb $32, %ah
+; CHECK-NEXT: orb %dl, %ah
+; CHECK-NEXT: addb %al, %al
+; CHECK-NEXT: andb $16, %al
+; CHECK-NEXT: orb %ah, %al
+; CHECK-NEXT: orb %cl, %al
+; CHECK-NEXT: shrb $4, %al
+; CHECK-NEXT: retl
+ %b = call i4 @llvm.bitreverse.i4(i4 %a)
+ ret i4 %b
+}
+
+; These tests check that bitreverse(constant) calls are folded
+
+define <2 x i16> @fold_v2i16() {
+; CHECK-LABEL: fold_v2i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: movw $-4096, %ax # imm = 0xF000
+; CHECK-NEXT: movw $240, %dx
+; CHECK-NEXT: retl
+ %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> <i16 15, i16 3840>)
+ ret <2 x i16> %b
+}
+
+define i24 @fold_i24() {
+; CHECK-LABEL: fold_i24:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $2048, %eax # imm = 0x800
+; CHECK-NEXT: retl
+ %b = call i24 @llvm.bitreverse.i24(i24 4096)
+ ret i24 %b
+}
+
+define i8 @fold_i8() {
+; CHECK-LABEL: fold_i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: movb $-16, %al
+; CHECK-NEXT: retl
+ %b = call i8 @llvm.bitreverse.i8(i8 15)
+ ret i8 %b
+}
+
+define i4 @fold_i4() {
+; CHECK-LABEL: fold_i4:
+; CHECK: # BB#0:
+; CHECK-NEXT: movb $1, %al
+; CHECK-NEXT: retl
+ %b = call i4 @llvm.bitreverse.i4(i4 8)
+ ret i4 %b
+}
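+
+; A quick hand check on the folded constants above (worked out here, not part
+; of the generated assertions): bitreverse on iN maps bit k to bit N-1-k, so
+;   v2i16: 15 = 0x000F -> 0xF000 (-4096)  and  3840 = 0x0F00 -> 0x00F0 (240)
+;   i24:   4096 = 1 << 12 -> 1 << 11 = 2048
+;   i8:    15 = 0x0F -> 0xF0 (-16 as a signed byte)
+;   i4:    8 = 0b1000 -> 0b0001 = 1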
+
+; These tests check that bitreverse(bitreverse()) calls are removed
+
+define i8 @identity_i8(i8 %a) {
+; CHECK-LABEL: identity_i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT: retl
+ %b = call i8 @llvm.bitreverse.i8(i8 %a)
+ %c = call i8 @llvm.bitreverse.i8(i8 %b)
+ ret i8 %c
+}
+
+define <2 x i16> @identity_v2i16(<2 x i16> %a) {
+; CHECK-LABEL: identity_v2i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: retl
+ %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
+ %c = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %b)
+ ret <2 x i16> %c
+}
+
+; These tests check that bitreverse(undef) calls are removed
+
+define i8 @undef_i8() {
+; CHECK-LABEL: undef_i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: retl
+ %b = call i8 @llvm.bitreverse.i8(i8 undef)
+ ret i8 %b
+}
+
+define <2 x i16> @undef_v2i16() {
+; CHECK-LABEL: undef_v2i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: retl
+ %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> undef)
+ ret <2 x i16> %b
+}
diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll
index 89defa956a45..0e790864db49 100644
--- a/test/CodeGen/X86/block-placement.ll
+++ b/test/CodeGen/X86/block-placement.ll
@@ -7,15 +7,15 @@ define i32 @test_ifchains(i32 %i, i32* %a, i32 %b) {
; that is not expected to run.
; CHECK-LABEL: test_ifchains:
; CHECK: %entry
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: %else1
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: %else2
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: %else3
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: %else4
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: %exit
; CHECK: %then1
; CHECK: %then2
@@ -81,11 +81,11 @@ define i32 @test_loop_cold_blocks(i32 %i, i32* %a) {
; Check that we sink cold loop blocks after the hot loop body.
; CHECK-LABEL: test_loop_cold_blocks:
; CHECK: %entry
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: %unlikely1
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: %unlikely2
-; CHECK: .align
+; CHECK: .p2align
; CHECK: %body1
; CHECK: %body2
; CHECK: %body3
@@ -242,7 +242,7 @@ define i32 @test_loop_align(i32 %i, i32* %a) {
; pass.
; CHECK-LABEL: test_loop_align:
; CHECK: %entry
-; CHECK: .align [[ALIGN:[0-9]+]],
+; CHECK: .p2align [[ALIGN:[0-9]+]],
; CHECK-NEXT: %body
; CHECK: %exit
@@ -267,11 +267,11 @@ define i32 @test_nested_loop_align(i32 %i, i32* %a, i32* %b) {
; Check that we provide nested loop body alignment.
; CHECK-LABEL: test_nested_loop_align:
; CHECK: %entry
-; CHECK: .align [[ALIGN]],
+; CHECK: .p2align [[ALIGN]],
; CHECK-NEXT: %loop.body.1
-; CHECK: .align [[ALIGN]],
+; CHECK: .p2align [[ALIGN]],
; CHECK-NEXT: %inner.loop.body
-; CHECK-NOT: .align
+; CHECK-NOT: .p2align
; CHECK: %exit
entry:
@@ -463,26 +463,24 @@ exit:
}
define void @fpcmp_unanalyzable_branch(i1 %cond) {
-; This function's CFG contains an unanalyzable branch that is likely to be
-; split due to having a different high-probability predecessor.
-; CHECK: fpcmp_unanalyzable_branch
-; CHECK: %entry
-; CHECK: %exit
-; CHECK-NOT: %if.then
-; CHECK-NOT: %if.end
-; CHECK-NOT: jne
-; CHECK-NOT: jnp
-; CHECK: jne
-; CHECK-NEXT: jnp
-; CHECK-NEXT: %if.then
+; This function's CFG contains a branch that used to be unanalyzable (une on
+; floating point). Now that it is analyzable, we should get the best layout,
+; in which every edge in 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' ->
+; 'if.end' is a fall-through.
+; CHECK-LABEL: fpcmp_unanalyzable_branch:
+; CHECK: # BB#0: # %entry
+; CHECK: # BB#1: # %entry.if.then_crit_edge
+; CHECK: .LBB10_4: # %if.then
+; CHECK: .LBB10_5: # %if.end
+; CHECK: # BB#3: # %exit
+; CHECK: jne .LBB10_4
+; CHECK-NEXT: jnp .LBB10_5
+; CHECK-NEXT: jmp .LBB10_4
entry:
; Note that this branch must be strongly biased toward
; 'entry.if.then_crit_edge' to ensure that we would try to form a chain for
-; 'entry' -> 'entry.if.then_crit_edge' -> 'if.then'. It is the last edge in that
-; chain which would violate the unanalyzable branch in 'exit', but we won't even
-; try this trick unless 'if.then' is believed to almost always be reached from
-; 'entry.if.then_crit_edge'.
+; 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' -> 'if.end'.
br i1 %cond, label %entry.if.then_crit_edge, label %lor.lhs.false, !prof !1
entry.if.then_crit_edge:
@@ -494,7 +492,7 @@ lor.lhs.false:
exit:
%cmp.i = fcmp une double 0.000000e+00, undef
- br i1 %cmp.i, label %if.then, label %if.end
+ br i1 %cmp.i, label %if.then, label %if.end, !prof !3
if.then:
%0 = phi i8 [ %.pre14, %entry.if.then_crit_edge ], [ undef, %exit ]
@@ -507,6 +505,7 @@ if.end:
}
!1 = !{!"branch_weights", i32 1000, i32 1}
+!3 = !{!"branch_weights", i32 1, i32 1000}
declare i32 @f()
declare i32 @g()
@@ -604,10 +603,8 @@ define void @test_unnatural_cfg_backwards_inner_loop() {
;
; CHECK: test_unnatural_cfg_backwards_inner_loop
; CHECK: %entry
-; CHECK: [[BODY:# BB#[0-9]+]]:
; CHECK: %loop2b
; CHECK: %loop1
-; CHECK: %loop2a
entry:
br i1 undef, label %loop2a, label %body
@@ -665,11 +662,14 @@ define void @unanalyzable_branch_to_best_succ(i1 %cond) {
; Ensure that we can handle unanalyzable branches where the destination block
; gets selected as the optimal successor to merge.
;
+; This branch is now analyzable and hence the destination block becomes the
+; hotter one. The right order is entry->bar->exit->foo.
+;
; CHECK: unanalyzable_branch_to_best_succ
; CHECK: %entry
-; CHECK: %foo
; CHECK: %bar
; CHECK: %exit
+; CHECK: %foo
entry:
; Bias this branch toward bar to ensure we form that chain.
@@ -943,18 +943,18 @@ define void @benchmark_heapsort(i32 %n, double* nocapture %ra) {
; CHECK: @benchmark_heapsort
; CHECK: %entry
; First rotated loop top.
-; CHECK: .align
+; CHECK: .p2align
; CHECK: %while.end
; CHECK: %for.cond
; CHECK: %if.then
; CHECK: %if.else
; CHECK: %if.end10
; Second rotated loop top
-; CHECK: .align
+; CHECK: .p2align
; CHECK: %if.then24
; CHECK: %while.cond.outer
; Third rotated loop top
-; CHECK: .align
+; CHECK: .p2align
; CHECK: %while.cond
; CHECK: %while.body
; CHECK: %land.lhs.true
@@ -1083,3 +1083,206 @@ exit:
%ret = phi i32 [ %val1, %then ], [ %val2, %else ]
ret i32 %ret
}
+
+; Make sure we put landingpads out of the way.
+declare i32 @pers(...)
+
+declare i32 @foo();
+
+declare i32 @bar();
+
+define i32 @test_lp(i32 %a) personality i32 (...)* @pers {
+; CHECK-LABEL: test_lp:
+; CHECK: %entry
+; CHECK: %hot
+; CHECK: %then
+; CHECK: %cold
+; CHECK: %coldlp
+; CHECK: %hotlp
+; CHECK: %lpret
+entry:
+ %0 = icmp sgt i32 %a, 1
+ br i1 %0, label %hot, label %cold, !prof !4
+
+hot:
+ %1 = invoke i32 @foo()
+ to label %then unwind label %hotlp
+
+cold:
+ %2 = invoke i32 @bar()
+ to label %then unwind label %coldlp
+
+then:
+ %3 = phi i32 [ %1, %hot ], [ %2, %cold ]
+ ret i32 %3
+
+hotlp:
+ %4 = landingpad { i8*, i32 }
+ cleanup
+ br label %lpret
+
+coldlp:
+ %5 = landingpad { i8*, i32 }
+ cleanup
+ br label %lpret
+
+lpret:
+ %6 = phi i32 [-1, %hotlp], [-2, %coldlp]
+ %7 = add i32 %6, 42
+ ret i32 %7
+}
+
+!4 = !{!"branch_weights", i32 65536, i32 0}
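+; (!4 weights the edge to %hot 65536:0, so %cold and both landing pads are
+; laid out after the hot code, with the colder landing pad placed first.)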
+
+; Make sure that ehpads are scheduled from the least probable one
+; to the most probable one. See selectBestCandidateBlock as to why.
+declare void @clean();
+
+define void @test_flow_unwind() personality i32 (...)* @pers {
+; CHECK-LABEL: test_flow_unwind:
+; CHECK: %entry
+; CHECK: %then
+; CHECK: %exit
+; CHECK: %innerlp
+; CHECK: %outerlp
+; CHECK: %outercleanup
+entry:
+ %0 = invoke i32 @foo()
+ to label %then unwind label %outerlp
+
+then:
+ %1 = invoke i32 @bar()
+ to label %exit unwind label %innerlp
+
+exit:
+ ret void
+
+innerlp:
+ %2 = landingpad { i8*, i32 }
+ cleanup
+ br label %innercleanup
+
+outerlp:
+ %3 = landingpad { i8*, i32 }
+ cleanup
+ br label %outercleanup
+
+outercleanup:
+ %4 = phi { i8*, i32 } [%2, %innercleanup], [%3, %outerlp]
+ call void @clean()
+ resume { i8*, i32 } %4
+
+innercleanup:
+ call void @clean()
+ br label %outercleanup
+}
+
+declare void @hot_function()
+
+define void @test_hot_branch(i32* %a) {
+; Test that a hot branch that has a probability a little larger than 80% will
+; break CFG constraints when doing block placement.
+; CHECK-LABEL: test_hot_branch:
+; CHECK: %entry
+; CHECK: %then
+; CHECK: %exit
+; CHECK: %else
+
+entry:
+ %gep1 = getelementptr i32, i32* %a, i32 1
+ %val1 = load i32, i32* %gep1
+ %cond1 = icmp ugt i32 %val1, 1
+ br i1 %cond1, label %then, label %else, !prof !5
+
+then:
+ call void @hot_function()
+ br label %exit
+
+else:
+ call void @cold_function()
+ br label %exit
+
+exit:
+ call void @hot_function()
+ ret void
+}
+
+define void @test_hot_branch_profile(i32* %a) !prof !6 {
+; Test that a hot branch that has a probability a little larger than 50% will
+; break CFG constraints when doing block placement when a profile is available.
+; CHECK-LABEL: test_hot_branch_profile:
+; CHECK: %entry
+; CHECK: %then
+; CHECK: %exit
+; CHECK: %else
+
+entry:
+ %gep1 = getelementptr i32, i32* %a, i32 1
+ %val1 = load i32, i32* %gep1
+ %cond1 = icmp ugt i32 %val1, 1
+ br i1 %cond1, label %then, label %else, !prof !7
+
+then:
+ call void @hot_function()
+ br label %exit
+
+else:
+ call void @cold_function()
+ br label %exit
+
+exit:
+ call void @hot_function()
+ ret void
+}
+
+define void @test_hot_branch_triangle_profile(i32* %a) !prof !6 {
+; Test that a hot branch that has a probability a little larger than 80% will
+; break triangle-shaped CFG constraints when doing block placement if a
+; profile is present.
+; CHECK-LABEL: test_hot_branch_triangle_profile:
+; CHECK: %entry
+; CHECK: %exit
+; CHECK: %then
+
+entry:
+ %gep1 = getelementptr i32, i32* %a, i32 1
+ %val1 = load i32, i32* %gep1
+ %cond1 = icmp ugt i32 %val1, 1
+ br i1 %cond1, label %exit, label %then, !prof !5
+
+then:
+ call void @hot_function()
+ br label %exit
+
+exit:
+ call void @hot_function()
+ ret void
+}
+
+define void @test_hot_branch_triangle_profile_topology(i32* %a) !prof !6 {
+; Test that a hot branch that has a probability between 50% and 66% will not
+; break triangle-shaped CFG constraints when doing block placement if a
+; profile is present.
+; CHECK-LABEL: test_hot_branch_triangle_profile_topology:
+; CHECK: %entry
+; CHECK: %then
+; CHECK: %exit
+
+entry:
+ %gep1 = getelementptr i32, i32* %a, i32 1
+ %val1 = load i32, i32* %gep1
+ %cond1 = icmp ugt i32 %val1, 1
+ br i1 %cond1, label %exit, label %then, !prof !7
+
+then:
+ call void @hot_function()
+ br label %exit
+
+exit:
+ call void @hot_function()
+ ret void
+}
+
+!5 = !{!"branch_weights", i32 84, i32 16}
+!6 = !{!"function_entry_count", i32 10}
+!7 = !{!"branch_weights", i32 60, i32 40}
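+
+; (For the tests above: !5 weights the taken edge 84:16, i.e. 84%, and !7
+; weights it 60:40, i.e. 60%, matching the thresholds the comments mention.)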
diff --git a/test/CodeGen/X86/bmi-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/bmi-intrinsics-fast-isel-x86_64.ll
new file mode 100644
index 000000000000..8b15a1591b67
--- /dev/null
+++ b/test/CodeGen/X86/bmi-intrinsics-fast-isel-x86_64.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/bmi-builtins.c
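+; (For reference, the mapping is spelled out by hand here rather than copied
+; from that file: andn is ~x & y, blsi is x & -x, blsmsk is x ^ (x - 1),
+; blsr is x & (x - 1), and tzcnt is cttz with a select for the zero input.)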
+
+;
+; AMD Intrinsics
+;
+
+define i64 @test__andn_u64(i64 %a0, i64 %a1) {
+; X64-LABEL: test__andn_u64:
+; X64: # BB#0:
+; X64-NEXT: xorq $-1, %rdi
+; X64-NEXT: andq %rsi, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %xor = xor i64 %a0, -1
+ %res = and i64 %xor, %a1
+ ret i64 %res
+}
+
+define i64 @test__bextr_u64(i64 %a0, i64 %a1) {
+; X64-LABEL: test__bextr_u64:
+; X64: # BB#0:
+; X64-NEXT: bextrq %rsi, %rdi, %rax
+; X64-NEXT: retq
+ %res = call i64 @llvm.x86.bmi.bextr.64(i64 %a0, i64 %a1)
+ ret i64 %res
+}
+
+define i64 @test__blsi_u64(i64 %a0) {
+; X64-LABEL: test__blsi_u64:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: subq %rdi, %rax
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: retq
+ %neg = sub i64 0, %a0
+ %res = and i64 %a0, %neg
+ ret i64 %res
+}
+
+define i64 @test__blsmsk_u64(i64 %a0) {
+; X64-LABEL: test__blsmsk_u64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: subq $1, %rax
+; X64-NEXT: xorq %rdi, %rax
+; X64-NEXT: retq
+ %dec = sub i64 %a0, 1
+ %res = xor i64 %a0, %dec
+ ret i64 %res
+}
+
+define i64 @test__blsr_u64(i64 %a0) {
+; X64-LABEL: test__blsr_u64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: subq $1, %rax
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: retq
+ %dec = sub i64 %a0, 1
+ %res = and i64 %a0, %dec
+ ret i64 %res
+}
+
+define i64 @test__tzcnt_u64(i64 %a0) {
+; X64-LABEL: test__tzcnt_u64:
+; X64: # BB#0:
+; X64-NEXT: movl $64, %ecx
+; X64-NEXT: tzcntq %rdi, %rax
+; X64-NEXT: cmovbq %rcx, %rax
+; X64-NEXT: retq
+ %cmp = icmp ne i64 %a0, 0
+ %cttz = call i64 @llvm.cttz.i64(i64 %a0, i1 true)
+ %res = select i1 %cmp, i64 %cttz, i64 64
+ ret i64 %res
+}
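+
+; (tzcnt sets CF when its source is zero, which is why the zero-input
+; fallback above becomes a cmovb of the constant 64.)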
+
+;
+; Intel intrinsics
+;
+
+define i64 @test_andn_u64(i64 %a0, i64 %a1) {
+; X64-LABEL: test_andn_u64:
+; X64: # BB#0:
+; X64-NEXT: xorq $-1, %rdi
+; X64-NEXT: andq %rsi, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %xor = xor i64 %a0, -1
+ %res = and i64 %xor, %a1
+ ret i64 %res
+}
+
+define i64 @test_bextr_u64(i64 %a0, i32 %a1, i32 %a2) {
+; X64-LABEL: test_bextr_u64:
+; X64: # BB#0:
+; X64-NEXT: andl $255, %esi
+; X64-NEXT: andl $255, %edx
+; X64-NEXT: shll $8, %edx
+; X64-NEXT: orl %esi, %edx
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: bextrq %rax, %rdi, %rax
+; X64-NEXT: retq
+ %and1 = and i32 %a1, 255
+ %and2 = and i32 %a2, 255
+ %shl = shl i32 %and2, 8
+ %or = or i32 %and1, %shl
+ %zext = zext i32 %or to i64
+ %res = call i64 @llvm.x86.bmi.bextr.64(i64 %a0, i64 %zext)
+ ret i64 %res
+}
+
+define i64 @test_blsi_u64(i64 %a0) {
+; X64-LABEL: test_blsi_u64:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: subq %rdi, %rax
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: retq
+ %neg = sub i64 0, %a0
+ %res = and i64 %a0, %neg
+ ret i64 %res
+}
+
+define i64 @test_blsmsk_u64(i64 %a0) {
+; X64-LABEL: test_blsmsk_u64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: subq $1, %rax
+; X64-NEXT: xorq %rdi, %rax
+; X64-NEXT: retq
+ %dec = sub i64 %a0, 1
+ %res = xor i64 %a0, %dec
+ ret i64 %res
+}
+
+define i64 @test_blsr_u64(i64 %a0) {
+; X64-LABEL: test_blsr_u64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: subq $1, %rax
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: retq
+ %dec = sub i64 %a0, 1
+ %res = and i64 %a0, %dec
+ ret i64 %res
+}
+
+define i64 @test_tzcnt_u64(i64 %a0) {
+; X64-LABEL: test_tzcnt_u64:
+; X64: # BB#0:
+; X64-NEXT: movl $64, %ecx
+; X64-NEXT: tzcntq %rdi, %rax
+; X64-NEXT: cmovbq %rcx, %rax
+; X64-NEXT: retq
+ %cmp = icmp ne i64 %a0, 0
+ %cttz = call i64 @llvm.cttz.i64(i64 %a0, i1 true)
+ %res = select i1 %cmp, i64 %cttz, i64 64
+ ret i64 %res
+}
+
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i64 @llvm.x86.bmi.bextr.64(i64, i64)
diff --git a/test/CodeGen/X86/bmi-intrinsics-fast-isel.ll b/test/CodeGen/X86/bmi-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..2b889dd054fa
--- /dev/null
+++ b/test/CodeGen/X86/bmi-intrinsics-fast-isel.ll
@@ -0,0 +1,326 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i686-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/bmi-builtins.c
+
+;
+; AMD Intrinsics
+;
+
+define i16 @test__tzcnt_u16(i16 %a0) {
+; X32-LABEL: test__tzcnt_u16:
+; X32: # BB#0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzwl %ax, %ecx
+; X32-NEXT: cmpl $0, %ecx
+; X32-NEXT: jne .LBB0_1
+; X32-NEXT: # BB#2:
+; X32-NEXT: movw $16, %ax
+; X32-NEXT: retl
+; X32-NEXT: .LBB0_1:
+; X32-NEXT: tzcntw %ax, %ax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__tzcnt_u16:
+; X64: # BB#0:
+; X64-NEXT: movw $16, %cx
+; X64-NEXT: movzwl %di, %edx
+; X64-NEXT: tzcntw %dx, %ax
+; X64-NEXT: cmpl $0, %edx
+; X64-NEXT: cmovew %cx, %ax
+; X64-NEXT: retq
+ %zext = zext i16 %a0 to i32
+ %cmp = icmp ne i32 %zext, 0
+ %cttz = call i16 @llvm.cttz.i16(i16 %a0, i1 true)
+ %res = select i1 %cmp, i16 %cttz, i16 16
+ ret i16 %res
+}
+
+define i32 @test__andn_u32(i32 %a0, i32 %a1) {
+; X32-LABEL: test__andn_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl $-1, %eax
+; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__andn_u32:
+; X64: # BB#0:
+; X64-NEXT: xorl $-1, %edi
+; X64-NEXT: andl %esi, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %xor = xor i32 %a0, -1
+ %res = and i32 %xor, %a1
+ ret i32 %res
+}
+
+define i32 @test__bextr_u32(i32 %a0, i32 %a1) {
+; X32-LABEL: test__bextr_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__bextr_u32:
+; X64: # BB#0:
+; X64-NEXT: bextrl %esi, %edi, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.bmi.bextr.32(i32 %a0, i32 %a1)
+ ret i32 %res
+}
+
+define i32 @test__blsi_u32(i32 %a0) {
+; X32-LABEL: test__blsi_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: subl %ecx, %eax
+; X32-NEXT: andl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blsi_u32:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: subl %edi, %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: retq
+ %neg = sub i32 0, %a0
+ %res = and i32 %a0, %neg
+ ret i32 %res
+}
+
+define i32 @test__blsmsk_u32(i32 %a0) {
+; X32-LABEL: test__blsmsk_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: subl $1, %eax
+; X32-NEXT: xorl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blsmsk_u32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: subl $1, %eax
+; X64-NEXT: xorl %edi, %eax
+; X64-NEXT: retq
+ %dec = sub i32 %a0, 1
+ %res = xor i32 %a0, %dec
+ ret i32 %res
+}
+
+define i32 @test__blsr_u32(i32 %a0) {
+; X32-LABEL: test__blsr_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: subl $1, %eax
+; X32-NEXT: andl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blsr_u32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: subl $1, %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: retq
+ %dec = sub i32 %a0, 1
+ %res = and i32 %a0, %dec
+ ret i32 %res
+}
+
+define i32 @test__tzcnt_u32(i32 %a0) {
+; X32-LABEL: test__tzcnt_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: cmpl $0, %eax
+; X32-NEXT: jne .LBB6_1
+; X32-NEXT: # BB#2:
+; X32-NEXT: movl $32, %eax
+; X32-NEXT: retl
+; X32-NEXT: .LBB6_1:
+; X32-NEXT: tzcntl %eax, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__tzcnt_u32:
+; X64: # BB#0:
+; X64-NEXT: movl $32, %ecx
+; X64-NEXT: tzcntl %edi, %eax
+; X64-NEXT: cmovbl %ecx, %eax
+; X64-NEXT: retq
+ %cmp = icmp ne i32 %a0, 0
+ %cttz = call i32 @llvm.cttz.i32(i32 %a0, i1 true)
+ %res = select i1 %cmp, i32 %cttz, i32 32
+ ret i32 %res
+}
+
+;
+; Intel intrinsics
+;
+
+define i16 @test_tzcnt_u16(i16 %a0) {
+; X32-LABEL: test_tzcnt_u16:
+; X32: # BB#0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzwl %ax, %ecx
+; X32-NEXT: cmpl $0, %ecx
+; X32-NEXT: jne .LBB7_1
+; X32-NEXT: # BB#2:
+; X32-NEXT: movw $16, %ax
+; X32-NEXT: retl
+; X32-NEXT: .LBB7_1:
+; X32-NEXT: tzcntw %ax, %ax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_tzcnt_u16:
+; X64: # BB#0:
+; X64-NEXT: movw $16, %cx
+; X64-NEXT: movzwl %di, %edx
+; X64-NEXT: tzcntw %dx, %ax
+; X64-NEXT: cmpl $0, %edx
+; X64-NEXT: cmovew %cx, %ax
+; X64-NEXT: retq
+ %zext = zext i16 %a0 to i32
+ %cmp = icmp ne i32 %zext, 0
+ %cttz = call i16 @llvm.cttz.i16(i16 %a0, i1 true)
+ %res = select i1 %cmp, i16 %cttz, i16 16
+ ret i16 %res
+}
+
+define i32 @test_andn_u32(i32 %a0, i32 %a1) {
+; X32-LABEL: test_andn_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl $-1, %eax
+; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_andn_u32:
+; X64: # BB#0:
+; X64-NEXT: xorl $-1, %edi
+; X64-NEXT: andl %esi, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %xor = xor i32 %a0, -1
+ %res = and i32 %xor, %a1
+ ret i32 %res
+}
+
+define i32 @test_bextr_u32(i32 %a0, i32 %a1, i32 %a2) {
+; X32-LABEL: test_bextr_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: andl $255, %ecx
+; X32-NEXT: andl $255, %eax
+; X32-NEXT: shll $8, %eax
+; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: bextrl %eax, {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_bextr_u32:
+; X64: # BB#0:
+; X64-NEXT: andl $255, %esi
+; X64-NEXT: andl $255, %edx
+; X64-NEXT: shll $8, %edx
+; X64-NEXT: orl %esi, %edx
+; X64-NEXT: bextrl %edx, %edi, %eax
+; X64-NEXT: retq
+ %and1 = and i32 %a1, 255
+ %and2 = and i32 %a2, 255
+ %shl = shl i32 %and2, 8
+ %or = or i32 %and1, %shl
+ %res = call i32 @llvm.x86.bmi.bextr.32(i32 %a0, i32 %or)
+ ret i32 %res
+}
+
+define i32 @test_blsi_u32(i32 %a0) {
+; X32-LABEL: test_blsi_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: subl %ecx, %eax
+; X32-NEXT: andl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_blsi_u32:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: subl %edi, %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: retq
+ %neg = sub i32 0, %a0
+ %res = and i32 %a0, %neg
+ ret i32 %res
+}
+
+define i32 @test_blsmsk_u32(i32 %a0) {
+; X32-LABEL: test_blsmsk_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: subl $1, %eax
+; X32-NEXT: xorl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_blsmsk_u32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: subl $1, %eax
+; X64-NEXT: xorl %edi, %eax
+; X64-NEXT: retq
+ %dec = sub i32 %a0, 1
+ %res = xor i32 %a0, %dec
+ ret i32 %res
+}
+
+define i32 @test_blsr_u32(i32 %a0) {
+; X32-LABEL: test_blsr_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: subl $1, %eax
+; X32-NEXT: andl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_blsr_u32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: subl $1, %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: retq
+ %dec = sub i32 %a0, 1
+ %res = and i32 %a0, %dec
+ ret i32 %res
+}
+
+define i32 @test_tzcnt_u32(i32 %a0) {
+; X32-LABEL: test_tzcnt_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: cmpl $0, %eax
+; X32-NEXT: jne .LBB13_1
+; X32-NEXT: # BB#2:
+; X32-NEXT: movl $32, %eax
+; X32-NEXT: retl
+; X32-NEXT: .LBB13_1:
+; X32-NEXT: tzcntl %eax, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_tzcnt_u32:
+; X64: # BB#0:
+; X64-NEXT: movl $32, %ecx
+; X64-NEXT: tzcntl %edi, %eax
+; X64-NEXT: cmovbl %ecx, %eax
+; X64-NEXT: retq
+ %cmp = icmp ne i32 %a0, 0
+ %cttz = call i32 @llvm.cttz.i32(i32 %a0, i1 true)
+ %res = select i1 %cmp, i32 %cttz, i32 32
+ ret i32 %res
+}
+
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i32 @llvm.x86.bmi.bextr.32(i32, i32)
diff --git a/test/CodeGen/X86/bmi.ll b/test/CodeGen/X86/bmi.ll
index 8b13e960cd8f..afeba4ef2d99 100644
--- a/test/CodeGen/X86/bmi.ll
+++ b/test/CodeGen/X86/bmi.ll
@@ -1,218 +1,437 @@
-; RUN: llc < %s -march=x86-64 -mattr=+bmi,+bmi2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s
-declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
-declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone
-declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
-declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
+declare i8 @llvm.cttz.i8(i8, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
-define i8 @t1(i8 %x) nounwind {
+define i8 @t1(i8 %x) {
+; CHECK-LABEL: t1:
+; CHECK: # BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: orl $256, %eax # imm = 0x100
+; CHECK-NEXT: tzcntl %eax, %eax
+; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
%tmp = tail call i8 @llvm.cttz.i8( i8 %x, i1 false )
ret i8 %tmp
-; CHECK-LABEL: t1:
-; CHECK: tzcntl
}
-define i16 @t2(i16 %x) nounwind {
+define i16 @t2(i16 %x) {
+; CHECK-LABEL: t2:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzcntw %di, %ax
+; CHECK-NEXT: retq
%tmp = tail call i16 @llvm.cttz.i16( i16 %x, i1 false )
ret i16 %tmp
-; CHECK-LABEL: t2:
-; CHECK: tzcntw
}
-define i32 @t3(i32 %x) nounwind {
+define i32 @t3(i32 %x) {
+; CHECK-LABEL: t3:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzcntl %edi, %eax
+; CHECK-NEXT: retq
%tmp = tail call i32 @llvm.cttz.i32( i32 %x, i1 false )
ret i32 %tmp
-; CHECK-LABEL: t3:
-; CHECK: tzcntl
}
-define i32 @tzcnt32_load(i32* %x) nounwind {
+define i32 @tzcnt32_load(i32* %x) {
+; CHECK-LABEL: tzcnt32_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzcntl (%rdi), %eax
+; CHECK-NEXT: retq
%x1 = load i32, i32* %x
%tmp = tail call i32 @llvm.cttz.i32(i32 %x1, i1 false )
ret i32 %tmp
-; CHECK-LABEL: tzcnt32_load:
-; CHECK: tzcntl ({{.*}})
}
-define i64 @t4(i64 %x) nounwind {
+define i64 @t4(i64 %x) {
+; CHECK-LABEL: t4:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzcntq %rdi, %rax
+; CHECK-NEXT: retq
%tmp = tail call i64 @llvm.cttz.i64( i64 %x, i1 false )
ret i64 %tmp
-; CHECK-LABEL: t4:
-; CHECK: tzcntq
}
-define i8 @t5(i8 %x) nounwind {
+define i8 @t5(i8 %x) {
+; CHECK-LABEL: t5:
+; CHECK: # BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: tzcntl %eax, %eax
+; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
%tmp = tail call i8 @llvm.cttz.i8( i8 %x, i1 true )
ret i8 %tmp
-; CHECK-LABEL: t5:
-; CHECK: tzcntl
}
-define i16 @t6(i16 %x) nounwind {
+define i16 @t6(i16 %x) {
+; CHECK-LABEL: t6:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzcntw %di, %ax
+; CHECK-NEXT: retq
%tmp = tail call i16 @llvm.cttz.i16( i16 %x, i1 true )
ret i16 %tmp
-; CHECK-LABEL: t6:
-; CHECK: tzcntw
}
-define i32 @t7(i32 %x) nounwind {
+define i32 @t7(i32 %x) {
+; CHECK-LABEL: t7:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzcntl %edi, %eax
+; CHECK-NEXT: retq
%tmp = tail call i32 @llvm.cttz.i32( i32 %x, i1 true )
ret i32 %tmp
-; CHECK-LABEL: t7:
-; CHECK: tzcntl
}
-define i64 @t8(i64 %x) nounwind {
+define i64 @t8(i64 %x) {
+; CHECK-LABEL: t8:
+; CHECK: # BB#0:
+; CHECK-NEXT: tzcntq %rdi, %rax
+; CHECK-NEXT: retq
%tmp = tail call i64 @llvm.cttz.i64( i64 %x, i1 true )
ret i64 %tmp
-; CHECK-LABEL: t8:
-; CHECK: tzcntq
}
-define i32 @andn32(i32 %x, i32 %y) nounwind readnone {
+define i32 @andn32(i32 %x, i32 %y) {
+; CHECK-LABEL: andn32:
+; CHECK: # BB#0:
+; CHECK-NEXT: andnl %esi, %edi, %eax
+; CHECK-NEXT: retq
%tmp1 = xor i32 %x, -1
%tmp2 = and i32 %y, %tmp1
ret i32 %tmp2
-; CHECK-LABEL: andn32:
-; CHECK: andnl
}
-define i32 @andn32_load(i32 %x, i32* %y) nounwind readnone {
+define i32 @andn32_load(i32 %x, i32* %y) {
+; CHECK-LABEL: andn32_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: andnl (%rsi), %edi, %eax
+; CHECK-NEXT: retq
%y1 = load i32, i32* %y
%tmp1 = xor i32 %x, -1
%tmp2 = and i32 %y1, %tmp1
ret i32 %tmp2
-; CHECK-LABEL: andn32_load:
-; CHECK: andnl ({{.*}})
}
-define i64 @andn64(i64 %x, i64 %y) nounwind readnone {
+define i64 @andn64(i64 %x, i64 %y) {
+; CHECK-LABEL: andn64:
+; CHECK: # BB#0:
+; CHECK-NEXT: andnq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
%tmp1 = xor i64 %x, -1
%tmp2 = and i64 %tmp1, %y
ret i64 %tmp2
-; CHECK-LABEL: andn64:
-; CHECK: andnq
}
-define i32 @bextr32(i32 %x, i32 %y) nounwind readnone {
+; Don't choose a 'test' if an 'andn' can be used.
+define i1 @andn_cmp(i32 %x, i32 %y) {
+; CHECK-LABEL: andn_cmp:
+; CHECK: # BB#0:
+; CHECK-NEXT: andnl %esi, %edi, %eax
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
+ %notx = xor i32 %x, -1
+ %and = and i32 %notx, %y
+ %cmp = icmp eq i32 %and, 0
+ ret i1 %cmp
+}
+
+; Recognize a disguised andn in the following 4 tests.
+define i1 @and_cmp1(i32 %x, i32 %y) {
+; CHECK-LABEL: and_cmp1:
+; CHECK: # BB#0:
+; CHECK-NEXT: andnl %esi, %edi, %eax
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
+ %and = and i32 %x, %y
+ %cmp = icmp eq i32 %and, %y
+ ret i1 %cmp
+}
+
+define i1 @and_cmp2(i32 %x, i32 %y) {
+; CHECK-LABEL: and_cmp2:
+; CHECK: # BB#0:
+; CHECK-NEXT: andnl %esi, %edi, %eax
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: retq
+ %and = and i32 %y, %x
+ %cmp = icmp ne i32 %and, %y
+ ret i1 %cmp
+}
+
+define i1 @and_cmp3(i32 %x, i32 %y) {
+; CHECK-LABEL: and_cmp3:
+; CHECK: # BB#0:
+; CHECK-NEXT: andnl %esi, %edi, %eax
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
+ %and = and i32 %x, %y
+ %cmp = icmp eq i32 %y, %and
+ ret i1 %cmp
+}
+
+define i1 @and_cmp4(i32 %x, i32 %y) {
+; CHECK-LABEL: and_cmp4:
+; CHECK: # BB#0:
+; CHECK-NEXT: andnl %esi, %edi, %eax
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: retq
+ %and = and i32 %y, %x
+ %cmp = icmp ne i32 %y, %and
+ ret i1 %cmp
+}
+
+; A mask and compare against a constant is ok for an 'andn' too
+; even though the BMI instruction doesn't have an immediate form.
+define i1 @and_cmp_const(i32 %x) {
+; CHECK-LABEL: and_cmp_const:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $43, %eax
+; CHECK-NEXT: andnl %eax, %edi, %eax
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
+ %and = and i32 %x, 43
+ %cmp = icmp eq i32 %and, 43
+ ret i1 %cmp
+}
+
+; But don't use 'andn' if the mask is a power-of-two.
+define i1 @and_cmp_const_power_of_two(i32 %x, i32 %y) {
+; CHECK-LABEL: and_cmp_const_power_of_two:
+; CHECK: # BB#0:
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: setae %al
+; CHECK-NEXT: retq
+ %shl = shl i32 1, %y
+ %and = and i32 %x, %shl
+ %cmp = icmp ne i32 %and, %shl
+ ret i1 %cmp
+}
+
+; Don't transform to 'andn' if there's another use of the 'and'.
+define i32 @and_cmp_not_one_use(i32 %x) {
+; CHECK-LABEL: and_cmp_not_one_use:
+; CHECK: # BB#0:
+; CHECK-NEXT: andl $37, %edi
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpl $37, %edi
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: addl %edi, %eax
+; CHECK-NEXT: retq
+ %and = and i32 %x, 37
+ %cmp = icmp eq i32 %and, 37
+ %ext = zext i1 %cmp to i32
+ %add = add i32 %and, %ext
+ ret i32 %add
+}
+
+; Verify that we're not transforming invalid comparison predicates.
+define i1 @not_an_andn1(i32 %x, i32 %y) {
+; CHECK-LABEL: not_an_andn1:
+; CHECK: # BB#0:
+; CHECK-NEXT: andl %esi, %edi
+; CHECK-NEXT: cmpl %edi, %esi
+; CHECK-NEXT: setg %al
+; CHECK-NEXT: retq
+ %and = and i32 %x, %y
+ %cmp = icmp sgt i32 %y, %and
+ ret i1 %cmp
+}
+
+define i1 @not_an_andn2(i32 %x, i32 %y) {
+; CHECK-LABEL: not_an_andn2:
+; CHECK: # BB#0:
+; CHECK-NEXT: andl %esi, %edi
+; CHECK-NEXT: cmpl %edi, %esi
+; CHECK-NEXT: setbe %al
+; CHECK-NEXT: retq
+ %and = and i32 %y, %x
+ %cmp = icmp ule i32 %y, %and
+ ret i1 %cmp
+}
+
+; Don't choose a 'test' if an 'andn' can be used.
+define i1 @andn_cmp_swap_ops(i64 %x, i64 %y) {
+; CHECK-LABEL: andn_cmp_swap_ops:
+; CHECK: # BB#0:
+; CHECK-NEXT: andnq %rsi, %rdi, %rax
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
+ %notx = xor i64 %x, -1
+ %and = and i64 %y, %notx
+ %cmp = icmp eq i64 %and, 0
+ ret i1 %cmp
+}
+
+; Use a 'test' (not an 'and') because 'andn' only works for i32/i64.
+define i1 @andn_cmp_i8(i8 %x, i8 %y) {
+; CHECK-LABEL: andn_cmp_i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: notb %sil
+; CHECK-NEXT: testb %sil, %dil
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
+ %noty = xor i8 %y, -1
+ %and = and i8 %x, %noty
+ %cmp = icmp eq i8 %and, 0
+ ret i1 %cmp
+}
+
+define i32 @bextr32(i32 %x, i32 %y) {
+; CHECK-LABEL: bextr32:
+; CHECK: # BB#0:
+; CHECK-NEXT: bextrl %esi, %edi, %eax
+; CHECK-NEXT: retq
%tmp = tail call i32 @llvm.x86.bmi.bextr.32(i32 %x, i32 %y)
ret i32 %tmp
-; CHECK-LABEL: bextr32:
-; CHECK: bextrl
}
-define i32 @bextr32_load(i32* %x, i32 %y) nounwind readnone {
+define i32 @bextr32_load(i32* %x, i32 %y) {
+; CHECK-LABEL: bextr32_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: bextrl %esi, (%rdi), %eax
+; CHECK-NEXT: retq
%x1 = load i32, i32* %x
%tmp = tail call i32 @llvm.x86.bmi.bextr.32(i32 %x1, i32 %y)
ret i32 %tmp
-; CHECK-LABEL: bextr32_load:
-; CHECK: bextrl {{.*}}, ({{.*}}), {{.*}}
}
-declare i32 @llvm.x86.bmi.bextr.32(i32, i32) nounwind readnone
+declare i32 @llvm.x86.bmi.bextr.32(i32, i32)
-define i32 @bextr32b(i32 %x) nounwind uwtable readnone ssp {
+define i32 @bextr32b(i32 %x) uwtable ssp {
+; CHECK-LABEL: bextr32b:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $3076, %eax # imm = 0xC04
+; CHECK-NEXT: bextrl %eax, %edi, %eax
+; CHECK-NEXT: retq
%1 = lshr i32 %x, 4
%2 = and i32 %1, 4095
ret i32 %2
-; CHECK-LABEL: bextr32b:
-; CHECK: bextrl
}
-define i32 @bextr32b_load(i32* %x) nounwind uwtable readnone ssp {
+define i32 @bextr32b_load(i32* %x) uwtable ssp {
+; CHECK-LABEL: bextr32b_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $3076, %eax # imm = 0xC04
+; CHECK-NEXT: bextrl %eax, (%rdi), %eax
+; CHECK-NEXT: retq
%1 = load i32, i32* %x
%2 = lshr i32 %1, 4
%3 = and i32 %2, 4095
ret i32 %3
-; CHECK-LABEL: bextr32b_load:
-; CHECK: bextrl {{.*}}, ({{.*}}), {{.*}}
}
-define i64 @bextr64(i64 %x, i64 %y) nounwind readnone {
+define i64 @bextr64(i64 %x, i64 %y) {
+; CHECK-LABEL: bextr64:
+; CHECK: # BB#0:
+; CHECK-NEXT: bextrq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
%tmp = tail call i64 @llvm.x86.bmi.bextr.64(i64 %x, i64 %y)
ret i64 %tmp
-; CHECK-LABEL: bextr64:
-; CHECK: bextrq
}
-declare i64 @llvm.x86.bmi.bextr.64(i64, i64) nounwind readnone
+declare i64 @llvm.x86.bmi.bextr.64(i64, i64)
-define i64 @bextr64b(i64 %x) nounwind uwtable readnone ssp {
+define i64 @bextr64b(i64 %x) uwtable ssp {
+; CHECK-LABEL: bextr64b:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $3076, %eax # imm = 0xC04
+; CHECK-NEXT: bextrl %eax, %edi, %eax
+; CHECK-NEXT: retq
%1 = lshr i64 %x, 4
%2 = and i64 %1, 4095
ret i64 %2
-; CHECK-LABEL: bextr64b:
-; CHECK: bextrq
}
define i64 @bextr64b_load(i64* %x) {
+; CHECK-LABEL: bextr64b_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $3076, %eax # imm = 0xC04
+; CHECK-NEXT: bextrl %eax, (%rdi), %eax
+; CHECK-NEXT: retq
%1 = load i64, i64* %x, align 8
%2 = lshr i64 %1, 4
%3 = and i64 %2, 4095
ret i64 %3
-; CHECK-LABEL: bextr64b_load:
-; CHECK: bextrq {{.*}}, ({{.*}}), {{.*}}
}
define i32 @non_bextr32(i32 %x) {
+; CHECK-LABEL: non_bextr32:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: shrl $2, %edi
+; CHECK-NEXT: andl $111, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
entry:
%shr = lshr i32 %x, 2
%and = and i32 %shr, 111
ret i32 %and
-; CHECK-LABEL: non_bextr32:
-; CHECK: shrl $2
-; CHECK: andl $111
}
define i64 @non_bextr64(i64 %x) {
+; CHECK-LABEL: non_bextr64:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: shrq $2, %rdi
+; CHECK-NEXT: movabsq $8589934590, %rax # imm = 0x1FFFFFFFE
+; CHECK-NEXT: andq %rdi, %rax
+; CHECK-NEXT: retq
entry:
%shr = lshr i64 %x, 2
%and = and i64 %shr, 8589934590
ret i64 %and
-; CHECK-LABEL: non_bextr64:
-; CHECK: shrq $2
-; CHECK: movabsq $8589934590
-; CHECK: andq
}
-define i32 @bzhi32(i32 %x, i32 %y) nounwind readnone {
+define i32 @bzhi32(i32 %x, i32 %y) {
+; CHECK-LABEL: bzhi32:
+; CHECK: # BB#0:
+; CHECK-NEXT: bzhil %esi, %edi, %eax
+; CHECK-NEXT: retq
%tmp = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %x, i32 %y)
ret i32 %tmp
-; CHECK-LABEL: bzhi32:
-; CHECK: bzhil
}
-define i32 @bzhi32_load(i32* %x, i32 %y) nounwind readnone {
+define i32 @bzhi32_load(i32* %x, i32 %y) {
+; CHECK-LABEL: bzhi32_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: bzhil %esi, (%rdi), %eax
+; CHECK-NEXT: retq
%x1 = load i32, i32* %x
%tmp = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %x1, i32 %y)
ret i32 %tmp
-; CHECK-LABEL: bzhi32_load:
-; CHECK: bzhil {{.*}}, ({{.*}}), {{.*}}
}
-declare i32 @llvm.x86.bmi.bzhi.32(i32, i32) nounwind readnone
+declare i32 @llvm.x86.bmi.bzhi.32(i32, i32)
-define i64 @bzhi64(i64 %x, i64 %y) nounwind readnone {
+define i64 @bzhi64(i64 %x, i64 %y) {
+; CHECK-LABEL: bzhi64:
+; CHECK: # BB#0:
+; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
%tmp = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %x, i64 %y)
ret i64 %tmp
-; CHECK-LABEL: bzhi64:
-; CHECK: bzhiq
}
-declare i64 @llvm.x86.bmi.bzhi.64(i64, i64) nounwind readnone
+declare i64 @llvm.x86.bmi.bzhi.64(i64, i64)
-define i32 @bzhi32b(i32 %x, i8 zeroext %index) #0 {
+define i32 @bzhi32b(i32 %x, i8 zeroext %index) {
+; CHECK-LABEL: bzhi32b:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: bzhil %esi, %edi, %eax
+; CHECK-NEXT: retq
entry:
%conv = zext i8 %index to i32
%shl = shl i32 1, %conv
%sub = add nsw i32 %shl, -1
%and = and i32 %sub, %x
ret i32 %and
-; CHECK-LABEL: bzhi32b:
-; CHECK: bzhil
}
-define i32 @bzhi32b_load(i32* %w, i8 zeroext %index) #0 {
+define i32 @bzhi32b_load(i32* %w, i8 zeroext %index) {
+; CHECK-LABEL: bzhi32b_load:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: bzhil %esi, (%rdi), %eax
+; CHECK-NEXT: retq
entry:
%x = load i32, i32* %w
%conv = zext i8 %index to i32
@@ -220,173 +439,211 @@ entry:
%sub = add nsw i32 %shl, -1
%and = and i32 %sub, %x
ret i32 %and
-; CHECK-LABEL: bzhi32b_load:
-; CHECK: bzhil {{.*}}, ({{.*}}), {{.*}}
}
-define i32 @bzhi32c(i32 %x, i8 zeroext %index) #0 {
+define i32 @bzhi32c(i32 %x, i8 zeroext %index) {
+; CHECK-LABEL: bzhi32c:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: bzhil %esi, %edi, %eax
+; CHECK-NEXT: retq
entry:
%conv = zext i8 %index to i32
%shl = shl i32 1, %conv
%sub = add nsw i32 %shl, -1
%and = and i32 %x, %sub
ret i32 %and
-; CHECK-LABEL: bzhi32c:
-; CHECK: bzhil
}
-define i64 @bzhi64b(i64 %x, i8 zeroext %index) #0 {
+define i64 @bzhi64b(i64 %x, i8 zeroext %index) {
+; CHECK-LABEL: bzhi64b:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
entry:
%conv = zext i8 %index to i64
%shl = shl i64 1, %conv
%sub = add nsw i64 %shl, -1
%and = and i64 %x, %sub
ret i64 %and
-; CHECK-LABEL: bzhi64b:
-; CHECK: bzhiq
}
-define i64 @bzhi64_constant_mask(i64 %x) #0 {
+define i64 @bzhi64_constant_mask(i64 %x) {
+; CHECK-LABEL: bzhi64_constant_mask:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movb $62, %al
+; CHECK-NEXT: bzhiq %rax, %rdi, %rax
+; CHECK-NEXT: retq
entry:
%and = and i64 %x, 4611686018427387903
ret i64 %and
-; CHECK-LABEL: bzhi64_constant_mask:
-; CHECK: movb $62, %al
-; CHECK: bzhiq %rax, %r[[ARG1:di|cx]], %rax
}
-define i64 @bzhi64_small_constant_mask(i64 %x) #0 {
+define i64 @bzhi64_small_constant_mask(i64 %x) {
+; CHECK-LABEL: bzhi64_small_constant_mask:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: andl $2147483647, %edi # imm = 0x7FFFFFFF
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
entry:
%and = and i64 %x, 2147483647
ret i64 %and
-; CHECK-LABEL: bzhi64_small_constant_mask:
-; CHECK: andl $2147483647, %e[[ARG1]]
}
-define i32 @blsi32(i32 %x) nounwind readnone {
+define i32 @blsi32(i32 %x) {
+; CHECK-LABEL: blsi32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsil %edi, %eax
+; CHECK-NEXT: retq
%tmp = sub i32 0, %x
%tmp2 = and i32 %x, %tmp
ret i32 %tmp2
-; CHECK-LABEL: blsi32:
-; CHECK: blsil
}
-define i32 @blsi32_load(i32* %x) nounwind readnone {
+define i32 @blsi32_load(i32* %x) {
+; CHECK-LABEL: blsi32_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsil (%rdi), %eax
+; CHECK-NEXT: retq
%x1 = load i32, i32* %x
%tmp = sub i32 0, %x1
%tmp2 = and i32 %x1, %tmp
ret i32 %tmp2
-; CHECK-LABEL: blsi32_load:
-; CHECK: blsil ({{.*}})
}
-define i64 @blsi64(i64 %x) nounwind readnone {
+define i64 @blsi64(i64 %x) {
+; CHECK-LABEL: blsi64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsiq %rdi, %rax
+; CHECK-NEXT: retq
%tmp = sub i64 0, %x
%tmp2 = and i64 %tmp, %x
ret i64 %tmp2
-; CHECK-LABEL: blsi64:
-; CHECK: blsiq
}
-define i32 @blsmsk32(i32 %x) nounwind readnone {
+define i32 @blsmsk32(i32 %x) {
+; CHECK-LABEL: blsmsk32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsmskl %edi, %eax
+; CHECK-NEXT: retq
%tmp = sub i32 %x, 1
%tmp2 = xor i32 %x, %tmp
ret i32 %tmp2
-; CHECK-LABEL: blsmsk32:
-; CHECK: blsmskl
}
-define i32 @blsmsk32_load(i32* %x) nounwind readnone {
+define i32 @blsmsk32_load(i32* %x) {
+; CHECK-LABEL: blsmsk32_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsmskl (%rdi), %eax
+; CHECK-NEXT: retq
%x1 = load i32, i32* %x
%tmp = sub i32 %x1, 1
%tmp2 = xor i32 %x1, %tmp
ret i32 %tmp2
-; CHECK-LABEL: blsmsk32_load:
-; CHECK: blsmskl ({{.*}})
}
-define i64 @blsmsk64(i64 %x) nounwind readnone {
+define i64 @blsmsk64(i64 %x) {
+; CHECK-LABEL: blsmsk64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsmskq %rdi, %rax
+; CHECK-NEXT: retq
%tmp = sub i64 %x, 1
%tmp2 = xor i64 %tmp, %x
ret i64 %tmp2
-; CHECK-LABEL: blsmsk64:
-; CHECK: blsmskq
}
-define i32 @blsr32(i32 %x) nounwind readnone {
+define i32 @blsr32(i32 %x) {
+; CHECK-LABEL: blsr32:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsrl %edi, %eax
+; CHECK-NEXT: retq
%tmp = sub i32 %x, 1
%tmp2 = and i32 %x, %tmp
ret i32 %tmp2
-; CHECK-LABEL: blsr32:
-; CHECK: blsrl
}
-define i32 @blsr32_load(i32* %x) nounwind readnone {
+define i32 @blsr32_load(i32* %x) {
+; CHECK-LABEL: blsr32_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsrl (%rdi), %eax
+; CHECK-NEXT: retq
%x1 = load i32, i32* %x
%tmp = sub i32 %x1, 1
%tmp2 = and i32 %x1, %tmp
ret i32 %tmp2
-; CHECK-LABEL: blsr32_load:
-; CHECK: blsrl ({{.*}})
}
-define i64 @blsr64(i64 %x) nounwind readnone {
+define i64 @blsr64(i64 %x) {
+; CHECK-LABEL: blsr64:
+; CHECK: # BB#0:
+; CHECK-NEXT: blsrq %rdi, %rax
+; CHECK-NEXT: retq
%tmp = sub i64 %x, 1
%tmp2 = and i64 %tmp, %x
ret i64 %tmp2
-; CHECK-LABEL: blsr64:
-; CHECK: blsrq
}
-define i32 @pdep32(i32 %x, i32 %y) nounwind readnone {
+define i32 @pdep32(i32 %x, i32 %y) {
+; CHECK-LABEL: pdep32:
+; CHECK: # BB#0:
+; CHECK-NEXT: pdepl %esi, %edi, %eax
+; CHECK-NEXT: retq
%tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 %y)
ret i32 %tmp
-; CHECK-LABEL: pdep32:
-; CHECK: pdepl
}
-define i32 @pdep32_load(i32 %x, i32* %y) nounwind readnone {
+define i32 @pdep32_load(i32 %x, i32* %y) {
+; CHECK-LABEL: pdep32_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: pdepl (%rsi), %edi, %eax
+; CHECK-NEXT: retq
%y1 = load i32, i32* %y
%tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 %y1)
ret i32 %tmp
-; CHECK-LABEL: pdep32_load:
-; CHECK: pdepl ({{.*}})
}
-declare i32 @llvm.x86.bmi.pdep.32(i32, i32) nounwind readnone
+declare i32 @llvm.x86.bmi.pdep.32(i32, i32)
-define i64 @pdep64(i64 %x, i64 %y) nounwind readnone {
+define i64 @pdep64(i64 %x, i64 %y) {
+; CHECK-LABEL: pdep64:
+; CHECK: # BB#0:
+; CHECK-NEXT: pdepq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
%tmp = tail call i64 @llvm.x86.bmi.pdep.64(i64 %x, i64 %y)
ret i64 %tmp
-; CHECK-LABEL: pdep64:
-; CHECK: pdepq
}
-declare i64 @llvm.x86.bmi.pdep.64(i64, i64) nounwind readnone
+declare i64 @llvm.x86.bmi.pdep.64(i64, i64)
-define i32 @pext32(i32 %x, i32 %y) nounwind readnone {
+define i32 @pext32(i32 %x, i32 %y) {
+; CHECK-LABEL: pext32:
+; CHECK: # BB#0:
+; CHECK-NEXT: pextl %esi, %edi, %eax
+; CHECK-NEXT: retq
%tmp = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 %y)
ret i32 %tmp
-; CHECK-LABEL: pext32:
-; CHECK: pextl
}
-define i32 @pext32_load(i32 %x, i32* %y) nounwind readnone {
+define i32 @pext32_load(i32 %x, i32* %y) {
+; CHECK-LABEL: pext32_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: pextl (%rsi), %edi, %eax
+; CHECK-NEXT: retq
%y1 = load i32, i32* %y
%tmp = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 %y1)
ret i32 %tmp
-; CHECK-LABEL: pext32_load:
-; CHECK: pextl ({{.*}})
}
-declare i32 @llvm.x86.bmi.pext.32(i32, i32) nounwind readnone
+declare i32 @llvm.x86.bmi.pext.32(i32, i32)
-define i64 @pext64(i64 %x, i64 %y) nounwind readnone {
+define i64 @pext64(i64 %x, i64 %y) {
+; CHECK-LABEL: pext64:
+; CHECK: # BB#0:
+; CHECK-NEXT: pextq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
%tmp = tail call i64 @llvm.x86.bmi.pext.64(i64 %x, i64 %y)
ret i64 %tmp
-; CHECK-LABEL: pext64:
-; CHECK: pextq
}
-declare i64 @llvm.x86.bmi.pext.64(i64, i64) nounwind readnone
+declare i64 @llvm.x86.bmi.pext.64(i64, i64)
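A minimal sketch of the fold the and_cmp tests above exercise (not part of the patch; the name @and_cmp_sketch is made up for illustration). The tests rely on the identity (x & y) == y <=> (~x & y) == 0: the compare succeeds exactly when every bit of y is also set in x, which is what a single andn plus a flags test computes.

define i1 @and_cmp_sketch(i32 %x, i32 %y) {
  %notx = xor i32 %x, -1         ; ~x
  %andn = and i32 %notx, %y      ; ~x & y, selectable as one andn
  %cmp = icmp eq i32 %andn, 0    ; the flags set by andn feed the sete
  ret i1 %cmp
}

The constants in the bextr32b/bextr64b checks follow the BEXTR control-word encoding, control = (length << 8) | start: extracting 12 bits (the mask 4095) starting at bit 4 gives (12 << 8) | 4 = 0xC04 = 3076, the value loaded into %eax before each bextrl.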
diff --git a/test/CodeGen/X86/bool-zext.ll b/test/CodeGen/X86/bool-zext.ll
index c98ad9e36d7e..5cc758c06b5d 100644
--- a/test/CodeGen/X86/bool-zext.ll
+++ b/test/CodeGen/X86/bool-zext.ll
@@ -1,10 +1,15 @@
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s -check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s -check-prefix=X64
; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN64
-; X64: @bar1
+; Check that the argument gets zero-extended before calling.
+; X86-LABEL: bar1
+; X86: movzbl
+; X86: calll
+; X64-LABEL: bar1
; X64: movzbl
; X64: jmp
-; WIN64: @bar1
+; WIN64-LABEL: bar1
; WIN64: movzbl
; WIN64: callq
define void @bar1(i1 zeroext %v1) nounwind ssp {
@@ -14,10 +19,11 @@ entry:
ret void
}
-; X64: @bar2
+; Check that on x86-64 the arguments are simply forwarded.
+; X64-LABEL: bar2
; X64-NOT: movzbl
; X64: jmp
-; WIN64: @bar2
+; WIN64-LABEL: bar2
; WIN64-NOT: movzbl
; WIN64: callq
define void @bar2(i8 zeroext %v1) nounwind ssp {
@@ -27,16 +33,19 @@ entry:
ret void
}
-; X64: @bar3
-; X64: callq
-; X64-NOT: movzbl
-; X64-NOT: and
-; X64: ret
-; WIN64: @bar3
-; WIN64: callq
-; WIN64-NOT: movzbl
-; WIN64-NOT: and
-; WIN64: ret
+; Check that i1 return values are not zero-extended.
+; X86-LABEL: bar3
+; X86: call
+; X86-NEXT: {{add|pop}}
+; X86-NEXT: ret
+; X64-LABEL: bar3
+; X64: call
+; X64-NEXT: {{add|pop}}
+; X64-NEXT: ret
+; WIN64-LABEL: bar3
+; WIN64: call
+; WIN64-NEXT: {{add|pop}}
+; WIN64-NEXT: ret
define zeroext i1 @bar3() nounwind ssp {
entry:
%call = call i1 @foo2() nounwind
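The checks above depend on LLVM's zeroext contract: the caller zero-extends a zeroext argument (hence the movzbl before bar1's call), while the callee zero-extends a zeroext return value, so bar3's caller may use the result without masking it again. A minimal sketch under those assumptions, with the hypothetical externs @takes_bool and @gives_bool standing in for the test's callees:

declare void @takes_bool(i1 zeroext)
declare zeroext i1 @gives_bool()

define i8 @zeroext_sketch(i1 %b) {
  call void @takes_bool(i1 zeroext %b)   ; caller is responsible for the zero extension
  %r = call zeroext i1 @gives_bool()     ; callee has already cleared the upper bits
  %z = zext i1 %r to i8                  ; so no extra masking should be needed here
  ret i8 %z
}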
diff --git a/test/CodeGen/X86/br-fold.ll b/test/CodeGen/X86/br-fold.ll
index fd1e73bde8cc..d3aedbb17e7d 100644
--- a/test/CodeGen/X86/br-fold.ll
+++ b/test/CodeGen/X86/br-fold.ll
@@ -2,11 +2,14 @@
; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck -check-prefix=X64_LINUX %s
; RUN: llc -mtriple=x86_64-pc-windows < %s | FileCheck -check-prefix=X64_WINDOWS %s
; RUN: llc -mtriple=x86_64-pc-windows-gnu < %s | FileCheck -check-prefix=X64_WINDOWS_GNU %s
+; RUN: llc -mtriple=x86_64-scei-ps4 < %s | FileCheck -check-prefix=PS4 %s
; X64_DARWIN: orq
+; X64_DARWIN-NEXT: jne
; X64_DARWIN-NEXT: %bb8.i329
; X64_LINUX: orq %rax, %rcx
+; X64_LINUX-NEXT: jne
; X64_LINUX-NEXT: %bb8.i329
; X64_WINDOWS: orq %rax, %rcx
@@ -15,6 +18,9 @@
; X64_WINDOWS_GNU: orq %rax, %rcx
; X64_WINDOWS_GNU-NEXT: ud2
+; PS4: orq %rax, %rcx
+; PS4-NEXT: ud2
+
@_ZN11xercesc_2_513SchemaSymbols21fgURI_SCHEMAFORSCHEMAE = external constant [33 x i16], align 32 ; <[33 x i16]*> [#uses=1]
@_ZN11xercesc_2_56XMLUni16fgNotationStringE = external constant [9 x i16], align 16 ; <[9 x i16]*> [#uses=1]
diff --git a/test/CodeGen/X86/break-false-dep.ll b/test/CodeGen/X86/break-false-dep.ll
index 699de22d5b56..74a0728f918d 100644
--- a/test/CodeGen/X86/break-false-dep.ll
+++ b/test/CodeGen/X86/break-false-dep.ll
@@ -64,7 +64,7 @@ declare float @llvm.sqrt.f32(float)
declare double @llvm.sqrt.f64(double)
; SSE-LABEL: loopdep1
-; SSE: for.body
+; SSE: for.body{{$}}
;
; This loop contains two cvtsi2ss instructions that update the same xmm
; register. Verify that the execution dependency fix pass breaks those
@@ -139,7 +139,7 @@ ret:
; This loop contains a cvtsi2sd instruction that has a loop-carried
; false dependency on an xmm that is modified by other scalar instructions
-; that follow it in the loop. Additionally, the source of convert is a
+; that follow it in the loop. Additionally, the source of convert is a
; memory operand. Verify the execution dependency fix pass breaks this
; dependency by inserting a xor before the convert.
@x = common global [1024 x double] zeroinitializer, align 16
diff --git a/test/CodeGen/X86/bss_pagealigned.ll b/test/CodeGen/X86/bss_pagealigned.ll
index da95aca110da..4e9f9241011c 100644
--- a/test/CodeGen/X86/bss_pagealigned.ll
+++ b/test/CodeGen/X86/bss_pagealigned.ll
@@ -15,7 +15,7 @@ define void @unxlate_dev_mem_ptr(i64 %phis, i8* %addr) nounwind {
}
@bm_pte = internal global [512 x %struct.kmem_cache_order_objects] zeroinitializer, section ".bss.page_aligned", align 4096
; CHECK: .section .bss.page_aligned,"aw",@nobits
-; CHECK-NEXT: .align 4096
+; CHECK-NEXT: .p2align 12
; CHECK-NEXT: bm_pte:
; CHECK-NEXT: .zero 4096
; CHECK-NEXT: .size bm_pte, 4096
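The updated directive is equivalent to the old one: .p2align 12 requests alignment to 2^12 = 4096 bytes, the same page alignment the previous .align 4096 expressed, so the check still pins bm_pte to a page boundary; only the spelling of the directive changed.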
diff --git a/test/CodeGen/X86/bswap-vector.ll b/test/CodeGen/X86/bswap-vector.ll
index 5376601a95e3..6697183ab679 100644
--- a/test/CodeGen/X86/bswap-vector.ll
+++ b/test/CodeGen/X86/bswap-vector.ll
@@ -1,7 +1,8 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-SSE --check-prefix=CHECK-NOSSSE3
; RUN: llc < %s -mcpu=core2 | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-SSE --check-prefix=CHECK-SSSE3
-; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s --check-prefix=CHECK-AVX --check-prefix=CHECK-AVX2
-; RUN: llc < %s -mcpu=core-avx2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE-AVX2
+; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-AVX --check-prefix=CHECK-AVX2
+; RUN: llc < %s -mcpu=core-avx2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-WIDE-AVX --check-prefix=CHECK-WIDE-AVX2
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -29,15 +30,15 @@ define <8 x i16> @test1(<8 x i16> %v) {
; CHECK-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; CHECK-SSSE3-NEXT: retq
;
-; CHECK-AVX2-LABEL: test1:
-; CHECK-AVX2: # BB#0: # %entry
-; CHECK-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-AVX2-NEXT: retq
+; CHECK-AVX-LABEL: test1:
+; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-AVX-NEXT: retq
;
-; CHECK-WIDE-AVX2-LABEL: test1:
-; CHECK-WIDE-AVX2: # BB#0: # %entry
-; CHECK-WIDE-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-WIDE-AVX2-NEXT: retq
+; CHECK-WIDE-AVX-LABEL: test1:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %v)
ret <8 x i16> %r
@@ -62,15 +63,15 @@ define <4 x i32> @test2(<4 x i32> %v) {
; CHECK-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-SSSE3-NEXT: retq
;
-; CHECK-AVX2-LABEL: test2:
-; CHECK-AVX2: # BB#0: # %entry
-; CHECK-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-AVX2-NEXT: retq
+; CHECK-AVX-LABEL: test2:
+; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-AVX-NEXT: retq
;
-; CHECK-WIDE-AVX2-LABEL: test2:
-; CHECK-WIDE-AVX2: # BB#0: # %entry
-; CHECK-WIDE-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-WIDE-AVX2-NEXT: retq
+; CHECK-WIDE-AVX-LABEL: test2:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %v)
ret <4 x i32> %r
@@ -97,15 +98,15 @@ define <2 x i64> @test3(<2 x i64> %v) {
; CHECK-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; CHECK-SSSE3-NEXT: retq
;
-; CHECK-AVX2-LABEL: test3:
-; CHECK-AVX2: # BB#0: # %entry
-; CHECK-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
-; CHECK-AVX2-NEXT: retq
+; CHECK-AVX-LABEL: test3:
+; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; CHECK-AVX-NEXT: retq
;
-; CHECK-WIDE-AVX2-LABEL: test3:
-; CHECK-WIDE-AVX2: # BB#0: # %entry
-; CHECK-WIDE-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
-; CHECK-WIDE-AVX2-NEXT: retq
+; CHECK-WIDE-AVX-LABEL: test3:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %v)
ret <2 x i64> %r
@@ -144,15 +145,15 @@ define <16 x i16> @test4(<16 x i16> %v) {
; CHECK-SSSE3-NEXT: pshufb %xmm2, %xmm1
; CHECK-SSSE3-NEXT: retq
;
-; CHECK-AVX2-LABEL: test4:
-; CHECK-AVX2: # BB#0: # %entry
-; CHECK-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
-; CHECK-AVX2-NEXT: retq
+; CHECK-AVX-LABEL: test4:
+; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
+; CHECK-AVX-NEXT: retq
;
-; CHECK-WIDE-AVX2-LABEL: test4:
-; CHECK-WIDE-AVX2: # BB#0: # %entry
-; CHECK-WIDE-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
-; CHECK-WIDE-AVX2-NEXT: retq
+; CHECK-WIDE-AVX-LABEL: test4:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %v)
ret <16 x i16> %r
@@ -187,15 +188,15 @@ define <8 x i32> @test5(<8 x i32> %v) {
; CHECK-SSSE3-NEXT: pshufb %xmm2, %xmm1
; CHECK-SSSE3-NEXT: retq
;
-; CHECK-AVX2-LABEL: test5:
-; CHECK-AVX2: # BB#0: # %entry
-; CHECK-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
-; CHECK-AVX2-NEXT: retq
+; CHECK-AVX-LABEL: test5:
+; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
+; CHECK-AVX-NEXT: retq
;
-; CHECK-WIDE-AVX2-LABEL: test5:
-; CHECK-WIDE-AVX2: # BB#0: # %entry
-; CHECK-WIDE-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
-; CHECK-WIDE-AVX2-NEXT: retq
+; CHECK-WIDE-AVX-LABEL: test5:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %v)
ret <8 x i32> %r
@@ -234,15 +235,15 @@ define <4 x i64> @test6(<4 x i64> %v) {
; CHECK-SSSE3-NEXT: pshufb %xmm2, %xmm1
; CHECK-SSSE3-NEXT: retq
;
-; CHECK-AVX2-LABEL: test6:
-; CHECK-AVX2: # BB#0: # %entry
-; CHECK-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
-; CHECK-AVX2-NEXT: retq
+; CHECK-AVX-LABEL: test6:
+; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
+; CHECK-AVX-NEXT: retq
;
-; CHECK-WIDE-AVX2-LABEL: test6:
-; CHECK-WIDE-AVX2: # BB#0: # %entry
-; CHECK-WIDE-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
-; CHECK-WIDE-AVX2-NEXT: retq
+; CHECK-WIDE-AVX-LABEL: test6:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %v)
ret <4 x i64> %r
@@ -271,16 +272,16 @@ define <4 x i16> @test7(<4 x i16> %v) {
; CHECK-SSSE3-NEXT: psrld $16, %xmm0
; CHECK-SSSE3-NEXT: retq
;
-; CHECK-AVX2-LABEL: test7:
-; CHECK-AVX2: # BB#0: # %entry
-; CHECK-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: retq
+; CHECK-AVX-LABEL: test7:
+; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-AVX-NEXT: vpsrld $16, %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
;
-; CHECK-WIDE-AVX2-LABEL: test7:
-; CHECK-WIDE-AVX2: # BB#0: # %entry
-; CHECK-WIDE-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-WIDE-AVX2-NEXT: retq
+; CHECK-WIDE-AVX-LABEL: test7:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %v)
ret <4 x i16> %r
@@ -293,7 +294,7 @@ entry:
define <8 x i16> @identity_v8i16(<8 x i16> %v) {
; CHECK-ALL-LABEL: identity_v8i16:
; CHECK-ALL: # BB#0: # %entry
-; CHECK-ALL: retq
+; CHECK-ALL-NEXT: retq
entry:
%bs1 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %v)
%bs2 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %bs1)
@@ -374,6 +375,11 @@ define <8 x i16> @fold_v8i16() {
; CHECK-AVX: # BB#0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536]
; CHECK-AVX-NEXT: retq
+;
+; CHECK-WIDE-AVX-LABEL: fold_v8i16:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> <i16 0, i16 1, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6>)
ret <8 x i16> %r
@@ -389,6 +395,11 @@ define <4 x i32> @fold_v4i32() {
; CHECK-AVX: # BB#0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,33554432,4261412863]
; CHECK-AVX-NEXT: retq
+;
+; CHECK-WIDE-AVX-LABEL: fold_v4i32:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,33554432,4261412863]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> <i32 0, i32 -1, i32 2, i32 -3>)
ret <4 x i32> %r
@@ -404,6 +415,11 @@ define <2 x i64> @fold_v2i64() {
; CHECK-AVX: # BB#0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615]
; CHECK-AVX-NEXT: retq
+;
+; CHECK-WIDE-AVX-LABEL: fold_v2i64:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> <i64 255, i64 -1>)
ret <2 x i64> %r
@@ -420,6 +436,11 @@ define <16 x i16> @fold_v16i16() {
; CHECK-AVX: # BB#0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,256,65535,512,65023,1024,64511,1536,63999,2048,63487,2560,62975,3072,62463,3584]
; CHECK-AVX-NEXT: retq
+;
+; CHECK-WIDE-AVX-LABEL: fold_v16i16:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,256,65535,512,65023,1024,64511,1536,63999,2048,63487,2560,62975,3072,62463,3584]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> <i16 0, i16 1, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14>)
ret <16 x i16> %r
@@ -436,6 +457,11 @@ define <8 x i32> @fold_v8i32() {
; CHECK-AVX: # BB#0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,16777216,4294967295,33554432,4261412863,67108864,4227858431,100663296]
; CHECK-AVX-NEXT: retq
+;
+; CHECK-WIDE-AVX-LABEL: fold_v8i32:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,16777216,4294967295,33554432,4261412863,67108864,4227858431,100663296]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> <i32 0, i32 1, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6>)
ret <8 x i32> %r
@@ -452,6 +478,11 @@ define <4 x i64> @fold_v4i64() {
; CHECK-AVX: # BB#0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18374686479671623680,18446744073709551615,18446462598732840960,72056494526300160]
; CHECK-AVX-NEXT: retq
+;
+; CHECK-WIDE-AVX-LABEL: fold_v4i64:
+; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18374686479671623680,18446744073709551615,18446462598732840960,72056494526300160]
+; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> <i64 255, i64 -1, i64 65535, i64 16776960>)
ret <4 x i64> %r
diff --git a/test/CodeGen/X86/bt.ll b/test/CodeGen/X86/bt.ll
index 036ec0acc6e8..aee4a93c6473 100644
--- a/test/CodeGen/X86/bt.ll
+++ b/test/CodeGen/X86/bt.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -mtriple=i386-apple-macosx -mcpu=penryn | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck --check-prefix=CHECK --check-prefix=PENTIUM4 %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck --check-prefix=CHECK --check-prefix=AVX-512 %s
+
; PR3253
; The register+memory form of the BT instruction should be usable on
@@ -18,516 +21,950 @@
; - The and can be commuted.
define void @test2(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: test2:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jb .LBB0_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB0_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: test2:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: jne .LBB0_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB0_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: test2
-; CHECK: btl %ecx, %eax
-; CHECK: jb
- %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
- %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = lshr i32 %x, %n
+ %tmp3 = and i32 %tmp29, 1
+ %tmp4 = icmp eq i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @test2b(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: test2b:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jb .LBB1_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB1_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: test2b:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: jne .LBB1_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB1_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: test2b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 1, %tmp29
- %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = lshr i32 %x, %n
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp eq i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @atest2(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: atest2:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jb .LBB2_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB2_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: atest2:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: jne .LBB2_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB2_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: atest2
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
- %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = ashr i32 %x, %n
+ %tmp3 = and i32 %tmp29, 1
+ %tmp4 = icmp eq i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @atest2b(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: atest2b:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jb .LBB3_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB3_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: atest2b:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: jne .LBB3_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB3_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: atest2b
-; CHECK: btl %e{{..}}, %e{{..}}
- %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 1, %tmp29
- %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = ashr i32 %x, %n
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp eq i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @test3(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: test3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jb .LBB4_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB4_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: test3
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
- %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %tmp29, %x
+ %tmp4 = icmp eq i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @test3b(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: test3b:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jb .LBB5_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB5_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: test3b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %x, %tmp29
- %tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %x, %tmp29
+ %tmp4 = icmp eq i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @testne2(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: testne2:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jae .LBB6_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB6_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: testne2:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: je .LBB6_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB6_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: testne2
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
- %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = lshr i32 %x, %n
+ %tmp3 = and i32 %tmp29, 1
+ %tmp4 = icmp ne i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @testne2b(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: testne2b:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jae .LBB7_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB7_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: testne2b:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: je .LBB7_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB7_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: testne2b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 1, %tmp29
- %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = lshr i32 %x, %n
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp ne i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @atestne2(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: atestne2:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jae .LBB8_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB8_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: atestne2:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: je .LBB8_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB8_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: atestne2
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
- %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = ashr i32 %x, %n
+ %tmp3 = and i32 %tmp29, 1
+ %tmp4 = icmp ne i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @atestne2b(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: atestne2b:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jae .LBB9_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB9_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: atestne2b:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: je .LBB9_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB9_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: atestne2b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 1, %tmp29
- %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = ashr i32 %x, %n
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp ne i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @testne3(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: testne3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jae .LBB10_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB10_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: testne3
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
- %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %tmp29, %x
+ %tmp4 = icmp ne i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @testne3b(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: testne3b:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jae .LBB11_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB11_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: testne3b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %x, %tmp29
- %tmp4 = icmp ne i32 %tmp3, 0 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %x, %tmp29
+ %tmp4 = icmp ne i32 %tmp3, 0
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @query2(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: query2:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jae .LBB12_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB12_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: query2:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: je .LBB12_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB12_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: query2
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
- %tmp4 = icmp eq i32 %tmp3, 1 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = lshr i32 %x, %n
+ %tmp3 = and i32 %tmp29, 1
+ %tmp4 = icmp eq i32 %tmp3, 1
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @query2b(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: query2b:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jae .LBB13_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB13_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: query2b:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: je .LBB13_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB13_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: query2b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 1, %tmp29
- %tmp4 = icmp eq i32 %tmp3, 1 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = lshr i32 %x, %n
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp eq i32 %tmp3, 1
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @aquery2(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: aquery2:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jae .LBB14_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB14_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: aquery2:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: je .LBB14_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB14_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: aquery2
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
- %tmp4 = icmp eq i32 %tmp3, 1 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = ashr i32 %x, %n
+ %tmp3 = and i32 %tmp29, 1
+ %tmp4 = icmp eq i32 %tmp3, 1
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @aquery2b(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: aquery2b:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jae .LBB15_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB15_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: aquery2b:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: testb $1, %dil
+; AVX-512-NEXT: je .LBB15_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB15_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: aquery2b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 1, %tmp29
- %tmp4 = icmp eq i32 %tmp3, 1 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = ashr i32 %x, %n
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp eq i32 %tmp3, 1
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @query3(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: query3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jae .LBB16_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB16_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: query3
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
- %tmp4 = icmp eq i32 %tmp3, %tmp29 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %tmp29, %x
+ %tmp4 = icmp eq i32 %tmp3, %tmp29
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @query3b(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: query3b:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jae .LBB17_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB17_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: query3b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %x, %tmp29
- %tmp4 = icmp eq i32 %tmp3, %tmp29 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %x, %tmp29
+ %tmp4 = icmp eq i32 %tmp3, %tmp29
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @query3x(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: query3x:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jae .LBB18_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB18_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: query3x
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
- %tmp4 = icmp eq i32 %tmp29, %tmp3 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %tmp29, %x
+ %tmp4 = icmp eq i32 %tmp29, %tmp3
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @query3bx(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: query3bx:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jae .LBB19_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB19_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: query3bx
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jae
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %x, %tmp29
- %tmp4 = icmp eq i32 %tmp29, %tmp3 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %x, %tmp29
+ %tmp4 = icmp eq i32 %tmp29, %tmp3
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @queryne2(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: queryne2:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jb .LBB20_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB20_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: queryne2:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: andl $1, %edi
+; AVX-512-NEXT: kmovw %edi, %k0
+; AVX-512-NEXT: kxnorw %k0, %k0, %k1
+; AVX-512-NEXT: kshiftrw $15, %k1, %k1
+; AVX-512-NEXT: kxorw %k1, %k0, %k0
+; AVX-512-NEXT: kmovw %k0, %eax
+; AVX-512-NEXT: testb %al, %al
+; AVX-512-NEXT: je .LBB20_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB20_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: queryne2
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
- %tmp4 = icmp ne i32 %tmp3, 1 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = lshr i32 %x, %n
+ %tmp3 = and i32 %tmp29, 1
+ %tmp4 = icmp ne i32 %tmp3, 1
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @queryne2b(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: queryne2b:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jb .LBB21_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB21_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: queryne2b:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: andl $1, %edi
+; AVX-512-NEXT: kmovw %edi, %k0
+; AVX-512-NEXT: kxnorw %k0, %k0, %k1
+; AVX-512-NEXT: kshiftrw $15, %k1, %k1
+; AVX-512-NEXT: kxorw %k1, %k0, %k0
+; AVX-512-NEXT: kmovw %k0, %eax
+; AVX-512-NEXT: testb %al, %al
+; AVX-512-NEXT: je .LBB21_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB21_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: queryne2b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 1, %tmp29
- %tmp4 = icmp ne i32 %tmp3, 1 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = lshr i32 %x, %n
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp ne i32 %tmp3, 1
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @aqueryne2(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: aqueryne2:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jb .LBB22_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB22_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: aqueryne2:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: andl $1, %edi
+; AVX-512-NEXT: kmovw %edi, %k0
+; AVX-512-NEXT: kxnorw %k0, %k0, %k1
+; AVX-512-NEXT: kshiftrw $15, %k1, %k1
+; AVX-512-NEXT: kxorw %k1, %k0, %k0
+; AVX-512-NEXT: kmovw %k0, %eax
+; AVX-512-NEXT: testb %al, %al
+; AVX-512-NEXT: je .LBB22_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB22_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: aqueryne2
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
- %tmp4 = icmp ne i32 %tmp3, 1 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = ashr i32 %x, %n
+ %tmp3 = and i32 %tmp29, 1
+ %tmp4 = icmp ne i32 %tmp3, 1
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @aqueryne2b(i32 %x, i32 %n) nounwind {
+; PENTIUM4-LABEL: aqueryne2b:
+; PENTIUM4: # BB#0: # %entry
+; PENTIUM4-NEXT: btl %esi, %edi
+; PENTIUM4-NEXT: jb .LBB23_2
+; PENTIUM4-NEXT: # BB#1: # %bb
+; PENTIUM4-NEXT: pushq %rax
+; PENTIUM4-NEXT: callq foo
+; PENTIUM4-NEXT: popq %rax
+; PENTIUM4-NEXT: .LBB23_2: # %UnifiedReturnBlock
+; PENTIUM4-NEXT: retq
+;
+; AVX-512-LABEL: aqueryne2b:
+; AVX-512: # BB#0: # %entry
+; AVX-512-NEXT: movl %esi, %ecx
+; AVX-512-NEXT: shrl %cl, %edi
+; AVX-512-NEXT: andl $1, %edi
+; AVX-512-NEXT: kmovw %edi, %k0
+; AVX-512-NEXT: kxnorw %k0, %k0, %k1
+; AVX-512-NEXT: kshiftrw $15, %k1, %k1
+; AVX-512-NEXT: kxorw %k1, %k0, %k0
+; AVX-512-NEXT: kmovw %k0, %eax
+; AVX-512-NEXT: testb %al, %al
+; AVX-512-NEXT: je .LBB23_2
+; AVX-512-NEXT: # BB#1: # %bb
+; AVX-512-NEXT: pushq %rax
+; AVX-512-NEXT: callq foo
+; AVX-512-NEXT: popq %rax
+; AVX-512-NEXT: .LBB23_2: # %UnifiedReturnBlock
+; AVX-512-NEXT: retq
entry:
-; CHECK: aqueryne2b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
- %tmp3 = and i32 1, %tmp29
- %tmp4 = icmp ne i32 %tmp3, 1 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = ashr i32 %x, %n
+ %tmp3 = and i32 1, %tmp29
+ %tmp4 = icmp ne i32 %tmp3, 1
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @queryne3(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: queryne3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jb .LBB24_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB24_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: queryne3
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
- %tmp4 = icmp ne i32 %tmp3, %tmp29 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %tmp29, %x
+ %tmp4 = icmp ne i32 %tmp3, %tmp29
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @queryne3b(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: queryne3b:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jb .LBB25_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB25_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: queryne3b
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %x, %tmp29
- %tmp4 = icmp ne i32 %tmp3, %tmp29 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %x, %tmp29
+ %tmp4 = icmp ne i32 %tmp3, %tmp29
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @queryne3x(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: queryne3x:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jb .LBB26_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB26_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: queryne3x
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
- %tmp4 = icmp ne i32 %tmp29, %tmp3 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %tmp29, %x
+ %tmp4 = icmp ne i32 %tmp29, %tmp3
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
define void @queryne3bx(i32 %x, i32 %n) nounwind {
+; CHECK-LABEL: queryne3bx:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: jb .LBB27_2
+; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .LBB27_2: # %UnifiedReturnBlock
+; CHECK-NEXT: retq
entry:
-; CHECK: queryne3bx
-; CHECK: btl %e{{..}}, %e{{..}}
-; CHECK: jb
- %tmp29 = shl i32 1, %n ; <i32> [#uses=1]
- %tmp3 = and i32 %x, %tmp29
- %tmp4 = icmp ne i32 %tmp29, %tmp3 ; <i1> [#uses=1]
- br i1 %tmp4, label %bb, label %UnifiedReturnBlock
-
-bb: ; preds = %entry
- call void @foo()
- ret void
-
-UnifiedReturnBlock: ; preds = %entry
- ret void
+ %tmp29 = shl i32 1, %n
+ %tmp3 = and i32 %x, %tmp29
+ %tmp4 = icmp ne i32 %tmp29, %tmp3
+ br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:
+ call void @foo()
+ ret void
+
+UnifiedReturnBlock:
+ ret void
}
declare void @foo()
define zeroext i1 @invert(i32 %flags, i32 %flag) nounwind {
-; CHECK: btl
-entry:
+; CHECK-LABEL: invert:
+; CHECK: # BB#0:
+; CHECK-NEXT: notl %edi
+; CHECK-NEXT: btl %esi, %edi
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: retq
%neg = xor i32 %flags, -1
%shl = shl i32 1, %flag
%and = and i32 %shl, %neg
%tobool = icmp ne i32 %and, 0
ret i1 %tobool
}
+
diff --git a/test/CodeGen/X86/buildvec-insertvec.ll b/test/CodeGen/X86/buildvec-insertvec.ll
index fd7290d58179..2ee33a1a9028 100644
--- a/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/test/CodeGen/X86/buildvec-insertvec.ll
@@ -47,7 +47,7 @@ entry:
define <2 x double> @test_negative_zero_2(<2 x double> %A) {
; CHECK-LABEL: test_negative_zero_2:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movhpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: retq
entry:
%0 = extractelement <2 x double> %A, i32 0
diff --git a/test/CodeGen/X86/byval2.ll b/test/CodeGen/X86/byval2.ll
index cc72a8699a9c..5eb8b590e8da 100644
--- a/test/CodeGen/X86/byval2.ll
+++ b/test/CodeGen/X86/byval2.ll
@@ -37,8 +37,8 @@ entry:
store i64 %b, i64* %tmp2, align 16
%tmp4 = getelementptr %struct.s, %struct.s* %d, i32 0, i32 2
store i64 %c, i64* %tmp4, align 16
- call void @f( %struct.s*byval %d )
- call void @f( %struct.s*byval %d )
+ call void @f( %struct.s* byval %d )
+ call void @f( %struct.s* byval %d )
ret void
}
diff --git a/test/CodeGen/X86/call-push.ll b/test/CodeGen/X86/call-push.ll
index 6bcb5d665618..e8afa1e77afa 100644
--- a/test/CodeGen/X86/call-push.ll
+++ b/test/CodeGen/X86/call-push.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -disable-fp-elim -no-x86-call-frame-opt | FileCheck %s
%struct.decode_t = type { i8, i8, i8, i8, i16, i8, i8, %struct.range_t** }
%struct.range_t = type { float, float, i32, i32, i32, [0 x i8] }
diff --git a/test/CodeGen/X86/catchpad-dynamic-alloca.ll b/test/CodeGen/X86/catchpad-dynamic-alloca.ll
new file mode 100644
index 000000000000..4e8a8d8868bd
--- /dev/null
+++ b/test/CodeGen/X86/catchpad-dynamic-alloca.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+declare void @rt_init()
+
+declare i32 @__CxxFrameHandler3(...)
+
+define void @test1(void ()* %fp, i64 %n) personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+ %t.i = alloca i8*
+ %t.ii = alloca i8
+ %.alloca8 = alloca i8, i64 %n
+ store volatile i8 0, i8* %t.ii
+ store volatile i8 0, i8* %.alloca8
+ invoke void @rt_init()
+ to label %try.cont unwind label %catch.switch
+
+try.cont:
+ invoke void %fp()
+ to label %exit unwind label %catch.switch
+
+exit:
+ ret void
+
+catch.pad:
+ %cp = catchpad within %cs [i8* null, i32 0, i8** %t.i]
+ catchret from %cp to label %exit
+
+catch.switch:
+ %cs = catchswitch within none [label %catch.pad] unwind to caller
+}
+
+; CHECK-LABEL: $handlerMap$0$test1:
+; CHECK: .long 0
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long 16
+
+define void @test2(void ()* %fp, i64 %n) personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+ %t.i = alloca i128
+ %.alloca8 = alloca i8, i64 %n
+ store volatile i8 0, i8* %.alloca8
+ invoke void @rt_init()
+ to label %try.cont unwind label %catch.switch
+
+try.cont:
+ invoke void %fp()
+ to label %exit unwind label %catch.switch
+
+exit:
+ ret void
+
+catch.pad:
+ %cp = catchpad within %cs [i8* null, i32 0, i128* %t.i]
+ catchret from %cp to label %exit
+
+catch.switch:
+ %cs = catchswitch within none [label %catch.pad] unwind to caller
+}
+
+; CHECK-LABEL: $handlerMap$0$test2:
+; CHECK: .long 0
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long 8
diff --git a/test/CodeGen/X86/catchpad-lifetime.ll b/test/CodeGen/X86/catchpad-lifetime.ll
index dfd75334561f..77d3f25057cf 100644
--- a/test/CodeGen/X86/catchpad-lifetime.ll
+++ b/test/CodeGen/X86/catchpad-lifetime.ll
@@ -16,7 +16,7 @@ entry:
to label %unreachable unwind label %catch.dispatch
; CHECK-LABEL: test1:
-; CHECK: movq $0, -16(%rbp)
+; CHECK: movq $0, -8(%rbp)
; CHECK: callq throw
catch.dispatch: ; preds = %entry
@@ -33,8 +33,8 @@ catch.pad: ; preds = %catch.dispatch
unreachable
; CHECK-LABEL: "?catch$2@?0?test1@4HA"
-; CHECK: movq $0, -16(%rbp)
-; CHECK: movq $0, -16(%rbp)
+; CHECK: movq $0, -8(%rbp)
+; CHECK: movq $0, -8(%rbp)
; CHECK: ud2
unreachable: ; preds = %entry
@@ -42,7 +42,7 @@ unreachable: ; preds = %entry
}
; CHECK-LABEL: $cppxdata$test1:
-; CHECK: .long 32 # CatchObjOffset
+; CHECK: .long 56 # CatchObjOffset
define void @test2() personality i32 (...)* @__CxxFrameHandler3 {
entry:
diff --git a/test/CodeGen/X86/catchret-regmask.ll b/test/CodeGen/X86/catchret-regmask.ll
new file mode 100644
index 000000000000..1231172a7e95
--- /dev/null
+++ b/test/CodeGen/X86/catchret-regmask.ll
@@ -0,0 +1,73 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+declare i32 @__CxxFrameHandler3(...)
+declare void @throw() noreturn uwtable
+declare i8* @getval()
+
+define i8* @reload_out_of_pad(i8* %arg) #0 personality i32 (...)* @__CxxFrameHandler3 {
+assertPassed:
+ invoke void @throw()
+ to label %unreachable unwind label %catch.dispatch
+
+catch:
+ %cp = catchpad within %cs [i8* null, i32 0, i8* null]
+ catchret from %cp to label %return
+
+ ; This block *must* appear after the catchret to test the bug.
+ ; FIXME: Make this an MIR test so we can control MBB layout.
+unreachable:
+ unreachable
+
+catch.dispatch:
+ %cs = catchswitch within none [label %catch] unwind to caller
+
+return:
+ ret i8* %arg
+}
+
+; CHECK-LABEL: reload_out_of_pad: # @reload_out_of_pad
+; CHECK: movq %rcx, -[[arg_slot:[0-9]+]](%rbp) # 8-byte Spill
+; CHECK: callq throw
+; CHECK: ud2
+; CHECK: movq -[[arg_slot]](%rbp), %rax # 8-byte Reload
+; CHECK: retq
+
+; CHECK: "?catch$3@?0?reload_out_of_pad@4HA":
+; CHECK-NOT: Reload
+; CHECK: retq
+
+define i8* @spill_in_pad() #0 personality i32 (...)* @__CxxFrameHandler3 {
+assertPassed:
+ invoke void @throw()
+ to label %unreachable unwind label %catch.dispatch
+
+catch:
+ %cp = catchpad within %cs [i8* null, i32 0, i8* null]
+ %val = call i8* @getval() [ "funclet"(token %cp) ]
+ catchret from %cp to label %return
+
+unreachable:
+ unreachable
+
+catch.dispatch:
+ %cs = catchswitch within none [label %catch] unwind to caller
+
+return:
+ ret i8* %val
+}
+
+; CHECK-LABEL: spill_in_pad: # @spill_in_pad
+; CHECK: callq throw
+; CHECK: ud2
+; CHECK: movq -[[val_slot:[0-9]+]](%rbp), %rax # 8-byte Reload
+; CHECK: retq
+
+; CHECK: "?catch$3@?0?spill_in_pad@4HA":
+; CHECK: callq getval
+; CHECK: movq %rax, -[[val_slot]](%rbp) # 8-byte Spill
+; CHECK: retq
+
+attributes #0 = { uwtable }
diff --git a/test/CodeGen/X86/cfstring.ll b/test/CodeGen/X86/cfstring.ll
index 3eeb8d2890cc..84032d045fb8 100644
--- a/test/CodeGen/X86/cfstring.ll
+++ b/test/CodeGen/X86/cfstring.ll
@@ -10,7 +10,7 @@
; CHECK-NEXT: L_.str3:
; CHECK: .section __DATA,__cfstring
-; CHECK-NEXT: .align 4
+; CHECK-NEXT: .p2align 4
; CHECK-NEXT: L__unnamed_cfstring_4:
; CHECK-NEXT: .quad ___CFConstantStringClassReference
; CHECK-NEXT: .long 1992
diff --git a/test/CodeGen/X86/cleanuppad-inalloca.ll b/test/CodeGen/X86/cleanuppad-inalloca.ll
index 2e34ada52e6b..c0660fee2f1a 100644
--- a/test/CodeGen/X86/cleanuppad-inalloca.ll
+++ b/test/CodeGen/X86/cleanuppad-inalloca.ll
@@ -38,8 +38,8 @@ ehcleanup: ; preds = %entry
; CHECK: pushl %ebp
; CHECK: movl %esp, %ebp
; CHECK: subl ${{[0-9]+}}, %esp
-; CHECK: movl $8, %eax
-; CHECK: calll __chkstk
+; CHECK: pushl %eax
+; CHECK: pushl %eax
; CHECK: calll "??0A@@QAE@XZ"
; CHECK: calll "??0A@@QAE@XZ"
; CHECK: calll _takes_two
diff --git a/test/CodeGen/X86/cleanuppad-realign.ll b/test/CodeGen/X86/cleanuppad-realign.ll
index 5a565cc1570f..314d5da07d72 100644
--- a/test/CodeGen/X86/cleanuppad-realign.ll
+++ b/test/CodeGen/X86/cleanuppad-realign.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=i686-pc-windows-msvc < %s | FileCheck --check-prefix=X86 %s
-; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck --check-prefix=X64 %s
+; RUN: llc -mtriple=i686-pc-windows-msvc -stack-symbol-ordering=0 < %s | FileCheck --check-prefix=X86 %s
+; RUN: llc -mtriple=x86_64-pc-windows-msvc -stack-symbol-ordering=0 < %s | FileCheck --check-prefix=X64 %s
declare i32 @__CxxFrameHandler3(...)
declare void @Dtor(i64* %o)
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
new file mode 100644
index 000000000000..e05451b80271
--- /dev/null
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -0,0 +1,683 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+;
+; PR6455 'Clear Upper Bits' Patterns
+;
+
+define <2 x i64> @_clearupper2xi64a(<2 x i64>) nounwind {
+; SSE-LABEL: _clearupper2xi64a:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _clearupper2xi64a:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %x0 = extractelement <2 x i64> %0, i32 0
+ %x1 = extractelement <2 x i64> %0, i32 1
+ %trunc0 = trunc i64 %x0 to i32
+ %trunc1 = trunc i64 %x1 to i32
+ %ext0 = zext i32 %trunc0 to i64
+ %ext1 = zext i32 %trunc1 to i64
+ %v0 = insertelement <2 x i64> undef, i64 %ext0, i32 0
+ %v1 = insertelement <2 x i64> %v0, i64 %ext1, i32 1
+ ret <2 x i64> %v1
+}
+
+define <4 x i32> @_clearupper4xi32a(<4 x i32>) nounwind {
+; SSE-LABEL: _clearupper4xi32a:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: _clearupper4xi32a:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _clearupper4xi32a:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %x0 = extractelement <4 x i32> %0, i32 0
+ %x1 = extractelement <4 x i32> %0, i32 1
+ %x2 = extractelement <4 x i32> %0, i32 2
+ %x3 = extractelement <4 x i32> %0, i32 3
+ %trunc0 = trunc i32 %x0 to i16
+ %trunc1 = trunc i32 %x1 to i16
+ %trunc2 = trunc i32 %x2 to i16
+ %trunc3 = trunc i32 %x3 to i16
+ %ext0 = zext i16 %trunc0 to i32
+ %ext1 = zext i16 %trunc1 to i32
+ %ext2 = zext i16 %trunc2 to i32
+ %ext3 = zext i16 %trunc3 to i32
+ %v0 = insertelement <4 x i32> undef, i32 %ext0, i32 0
+ %v1 = insertelement <4 x i32> %v0, i32 %ext1, i32 1
+ %v2 = insertelement <4 x i32> %v1, i32 %ext2, i32 2
+ %v3 = insertelement <4 x i32> %v2, i32 %ext3, i32 3
+ ret <4 x i32> %v3
+}
+
+define <8 x i16> @_clearupper8xi16a(<8 x i16>) nounwind {
+; SSE-LABEL: _clearupper8xi16a:
+; SSE: # BB#0:
+; SSE-NEXT: pextrw $1, %xmm0, %eax
+; SSE-NEXT: pextrw $2, %xmm0, %r9d
+; SSE-NEXT: pextrw $3, %xmm0, %edx
+; SSE-NEXT: pextrw $4, %xmm0, %r8d
+; SSE-NEXT: pextrw $5, %xmm0, %edi
+; SSE-NEXT: pextrw $6, %xmm0, %esi
+; SSE-NEXT: pextrw $7, %xmm0, %ecx
+; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: movd %edx, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: movd %edi, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE-NEXT: movd %esi, %xmm1
+; SSE-NEXT: movd %r9d, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: movd %r8d, %xmm1
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _clearupper8xi16a:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrw $1, %xmm0, %eax
+; AVX-NEXT: vpextrw $2, %xmm0, %ecx
+; AVX-NEXT: vpextrw $3, %xmm0, %edx
+; AVX-NEXT: vpextrw $4, %xmm0, %esi
+; AVX-NEXT: vpextrw $5, %xmm0, %edi
+; AVX-NEXT: vpextrw $6, %xmm0, %r8d
+; AVX-NEXT: vpextrw $7, %xmm0, %r9d
+; AVX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $3, %edx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $4, %esi, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $6, %r8d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %x0 = extractelement <8 x i16> %0, i32 0
+ %x1 = extractelement <8 x i16> %0, i32 1
+ %x2 = extractelement <8 x i16> %0, i32 2
+ %x3 = extractelement <8 x i16> %0, i32 3
+ %x4 = extractelement <8 x i16> %0, i32 4
+ %x5 = extractelement <8 x i16> %0, i32 5
+ %x6 = extractelement <8 x i16> %0, i32 6
+ %x7 = extractelement <8 x i16> %0, i32 7
+ %trunc0 = trunc i16 %x0 to i8
+ %trunc1 = trunc i16 %x1 to i8
+ %trunc2 = trunc i16 %x2 to i8
+ %trunc3 = trunc i16 %x3 to i8
+ %trunc4 = trunc i16 %x4 to i8
+ %trunc5 = trunc i16 %x5 to i8
+ %trunc6 = trunc i16 %x6 to i8
+ %trunc7 = trunc i16 %x7 to i8
+ %ext0 = zext i8 %trunc0 to i16
+ %ext1 = zext i8 %trunc1 to i16
+ %ext2 = zext i8 %trunc2 to i16
+ %ext3 = zext i8 %trunc3 to i16
+ %ext4 = zext i8 %trunc4 to i16
+ %ext5 = zext i8 %trunc5 to i16
+ %ext6 = zext i8 %trunc6 to i16
+ %ext7 = zext i8 %trunc7 to i16
+ %v0 = insertelement <8 x i16> undef, i16 %ext0, i32 0
+ %v1 = insertelement <8 x i16> %v0, i16 %ext1, i32 1
+ %v2 = insertelement <8 x i16> %v1, i16 %ext2, i32 2
+ %v3 = insertelement <8 x i16> %v2, i16 %ext3, i32 3
+ %v4 = insertelement <8 x i16> %v3, i16 %ext4, i32 4
+ %v5 = insertelement <8 x i16> %v4, i16 %ext5, i32 5
+ %v6 = insertelement <8 x i16> %v5, i16 %ext6, i32 6
+ %v7 = insertelement <8 x i16> %v6, i16 %ext7, i32 7
+ ret <8 x i16> %v7
+}
+
+define <16 x i8> @_clearupper16xi8a(<16 x i8>) nounwind {
+; SSE-LABEL: _clearupper16xi8a:
+; SSE: # BB#0:
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: movd %esi, %xmm0
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE-NEXT: movd %ecx, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE-NEXT: movd %edx, %xmm0
+; SSE-NEXT: movd %esi, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: movd %edi, %xmm0
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE-NEXT: movd %edx, %xmm3
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE-NEXT: movd %r9d, %xmm0
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: movd %r8d, %xmm0
+; SSE-NEXT: movd %ecx, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _clearupper16xi8a:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: vpextrb $1, %xmm0, %eax
+; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $2, %xmm0, %eax
+; AVX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $3, %xmm0, %eax
+; AVX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $4, %xmm0, %eax
+; AVX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $5, %xmm0, %eax
+; AVX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $6, %xmm0, %eax
+; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $7, %xmm0, %eax
+; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $8, %xmm0, %eax
+; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $9, %xmm0, %eax
+; AVX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $10, %xmm0, %eax
+; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $11, %xmm0, %eax
+; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $12, %xmm0, %eax
+; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $13, %xmm0, %eax
+; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $14, %xmm0, %eax
+; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrb $15, %xmm0, %eax
+; AVX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %x0 = extractelement <16 x i8> %0, i32 0
+ %x1 = extractelement <16 x i8> %0, i32 1
+ %x2 = extractelement <16 x i8> %0, i32 2
+ %x3 = extractelement <16 x i8> %0, i32 3
+ %x4 = extractelement <16 x i8> %0, i32 4
+ %x5 = extractelement <16 x i8> %0, i32 5
+ %x6 = extractelement <16 x i8> %0, i32 6
+ %x7 = extractelement <16 x i8> %0, i32 7
+ %x8 = extractelement <16 x i8> %0, i32 8
+ %x9 = extractelement <16 x i8> %0, i32 9
+ %x10 = extractelement <16 x i8> %0, i32 10
+ %x11 = extractelement <16 x i8> %0, i32 11
+ %x12 = extractelement <16 x i8> %0, i32 12
+ %x13 = extractelement <16 x i8> %0, i32 13
+ %x14 = extractelement <16 x i8> %0, i32 14
+ %x15 = extractelement <16 x i8> %0, i32 15
+ %trunc0 = trunc i8 %x0 to i4
+ %trunc1 = trunc i8 %x1 to i4
+ %trunc2 = trunc i8 %x2 to i4
+ %trunc3 = trunc i8 %x3 to i4
+ %trunc4 = trunc i8 %x4 to i4
+ %trunc5 = trunc i8 %x5 to i4
+ %trunc6 = trunc i8 %x6 to i4
+ %trunc7 = trunc i8 %x7 to i4
+ %trunc8 = trunc i8 %x8 to i4
+ %trunc9 = trunc i8 %x9 to i4
+ %trunc10 = trunc i8 %x10 to i4
+ %trunc11 = trunc i8 %x11 to i4
+ %trunc12 = trunc i8 %x12 to i4
+ %trunc13 = trunc i8 %x13 to i4
+ %trunc14 = trunc i8 %x14 to i4
+ %trunc15 = trunc i8 %x15 to i4
+ %ext0 = zext i4 %trunc0 to i8
+ %ext1 = zext i4 %trunc1 to i8
+ %ext2 = zext i4 %trunc2 to i8
+ %ext3 = zext i4 %trunc3 to i8
+ %ext4 = zext i4 %trunc4 to i8
+ %ext5 = zext i4 %trunc5 to i8
+ %ext6 = zext i4 %trunc6 to i8
+ %ext7 = zext i4 %trunc7 to i8
+ %ext8 = zext i4 %trunc8 to i8
+ %ext9 = zext i4 %trunc9 to i8
+ %ext10 = zext i4 %trunc10 to i8
+ %ext11 = zext i4 %trunc11 to i8
+ %ext12 = zext i4 %trunc12 to i8
+ %ext13 = zext i4 %trunc13 to i8
+ %ext14 = zext i4 %trunc14 to i8
+ %ext15 = zext i4 %trunc15 to i8
+ %v0 = insertelement <16 x i8> undef, i8 %ext0, i32 0
+ %v1 = insertelement <16 x i8> %v0, i8 %ext1, i32 1
+ %v2 = insertelement <16 x i8> %v1, i8 %ext2, i32 2
+ %v3 = insertelement <16 x i8> %v2, i8 %ext3, i32 3
+ %v4 = insertelement <16 x i8> %v3, i8 %ext4, i32 4
+ %v5 = insertelement <16 x i8> %v4, i8 %ext5, i32 5
+ %v6 = insertelement <16 x i8> %v5, i8 %ext6, i32 6
+ %v7 = insertelement <16 x i8> %v6, i8 %ext7, i32 7
+ %v8 = insertelement <16 x i8> %v7, i8 %ext8, i32 8
+ %v9 = insertelement <16 x i8> %v8, i8 %ext9, i32 9
+ %v10 = insertelement <16 x i8> %v9, i8 %ext10, i32 10
+ %v11 = insertelement <16 x i8> %v10, i8 %ext11, i32 11
+ %v12 = insertelement <16 x i8> %v11, i8 %ext12, i32 12
+ %v13 = insertelement <16 x i8> %v12, i8 %ext13, i32 13
+ %v14 = insertelement <16 x i8> %v13, i8 %ext14, i32 14
+ %v15 = insertelement <16 x i8> %v14, i8 %ext15, i32 15
+ ret <16 x i8> %v15
+}
+
+define <2 x i64> @_clearupper2xi64b(<2 x i64>) nounwind {
+; SSE-LABEL: _clearupper2xi64b:
+; SSE: # BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: _clearupper2xi64b:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _clearupper2xi64b:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: retq
+ %x32 = bitcast <2 x i64> %0 to <4 x i32>
+ %r0 = insertelement <4 x i32> %x32, i32 zeroinitializer, i32 1
+ %r1 = insertelement <4 x i32> %r0, i32 zeroinitializer, i32 3
+ %r = bitcast <4 x i32> %r1 to <2 x i64>
+ ret <2 x i64> %r
+}
+
+define <4 x i32> @_clearupper4xi32b(<4 x i32>) nounwind {
+; SSE-LABEL: _clearupper4xi32b:
+; SSE: # BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: pinsrw $1, %eax, %xmm0
+; SSE-NEXT: pinsrw $3, %eax, %xmm0
+; SSE-NEXT: pinsrw $5, %eax, %xmm0
+; SSE-NEXT: pinsrw $7, %eax, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _clearupper4xi32b:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX-NEXT: retq
+ %x16 = bitcast <4 x i32> %0 to <8 x i16>
+ %r0 = insertelement <8 x i16> %x16, i16 zeroinitializer, i32 1
+ %r1 = insertelement <8 x i16> %r0, i16 zeroinitializer, i32 3
+ %r2 = insertelement <8 x i16> %r1, i16 zeroinitializer, i32 5
+ %r3 = insertelement <8 x i16> %r2, i16 zeroinitializer, i32 7
+ %r = bitcast <8 x i16> %r3 to <4 x i32>
+ ret <4 x i32> %r
+}
+
+define <8 x i16> @_clearupper8xi16b(<8 x i16>) nounwind {
+; SSE-LABEL: _clearupper8xi16b:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: psllw $8, %xmm3
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pslld $24, %xmm3
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: psllq $40, %xmm3
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: psllq $56, %xmm3
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6]
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
+; SSE-NEXT: pandn %xmm3, %xmm2
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _clearupper8xi16b:
+; AVX: # BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %x8 = bitcast <8 x i16> %0 to <16 x i8>
+ %r0 = insertelement <16 x i8> %x8, i8 zeroinitializer, i32 1
+ %r1 = insertelement <16 x i8> %r0, i8 zeroinitializer, i32 3
+ %r2 = insertelement <16 x i8> %r1, i8 zeroinitializer, i32 5
+ %r3 = insertelement <16 x i8> %r2, i8 zeroinitializer, i32 7
+ %r4 = insertelement <16 x i8> %r3, i8 zeroinitializer, i32 9
+ %r5 = insertelement <16 x i8> %r4, i8 zeroinitializer, i32 11
+ %r6 = insertelement <16 x i8> %r5, i8 zeroinitializer, i32 13
+ %r7 = insertelement <16 x i8> %r6, i8 zeroinitializer, i32 15
+ %r = bitcast <16 x i8> %r7 to <8 x i16>
+ ret <8 x i16> %r
+}
+
+define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind {
+; SSE-LABEL: _clearupper16xi8b:
+; SSE: # BB#0:
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT: movd %xmm0, %rcx
+; SSE-NEXT: movq %rcx, %r8
+; SSE-NEXT: movq %rcx, %r9
+; SSE-NEXT: movq %rcx, %r10
+; SSE-NEXT: movq %rcx, %rax
+; SSE-NEXT: movq %rcx, %rdx
+; SSE-NEXT: movq %rcx, %rsi
+; SSE-NEXT: movq %rcx, %rdi
+; SSE-NEXT: andb $15, %cl
+; SSE-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movd %xmm1, %rcx
+; SSE-NEXT: shrq $56, %rdi
+; SSE-NEXT: andb $15, %dil
+; SSE-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, %r11
+; SSE-NEXT: shrq $48, %rsi
+; SSE-NEXT: andb $15, %sil
+; SSE-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, %r14
+; SSE-NEXT: shrq $40, %rdx
+; SSE-NEXT: andb $15, %dl
+; SSE-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, %rdx
+; SSE-NEXT: shrq $32, %rax
+; SSE-NEXT: andb $15, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, %rax
+; SSE-NEXT: shrq $24, %r10
+; SSE-NEXT: andb $15, %r10b
+; SSE-NEXT: movb %r10b, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, %rdi
+; SSE-NEXT: shrq $16, %r9
+; SSE-NEXT: andb $15, %r9b
+; SSE-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, %rsi
+; SSE-NEXT: shrq $8, %r8
+; SSE-NEXT: andb $15, %r8b
+; SSE-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, %rbx
+; SSE-NEXT: movb $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: andb $15, %cl
+; SSE-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: shrq $56, %rbx
+; SSE-NEXT: andb $15, %bl
+; SSE-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: shrq $48, %rsi
+; SSE-NEXT: andb $15, %sil
+; SSE-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: shrq $40, %rdi
+; SSE-NEXT: andb $15, %dil
+; SSE-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: shrq $32, %rax
+; SSE-NEXT: andb $15, %al
+; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: shrq $24, %rdx
+; SSE-NEXT: andb $15, %dl
+; SSE-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: shrq $16, %r14
+; SSE-NEXT: andb $15, %r14b
+; SSE-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: shrq $8, %r11
+; SSE-NEXT: andb $15, %r11b
+; SSE-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movb $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r14
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _clearupper16xi8b:
+; AVX: # BB#0:
+; AVX-NEXT: pushq %rbp
+; AVX-NEXT: pushq %r15
+; AVX-NEXT: pushq %r14
+; AVX-NEXT: pushq %r13
+; AVX-NEXT: pushq %r12
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
+; AVX-NEXT: movq %rcx, %r8
+; AVX-NEXT: movq %rcx, %r9
+; AVX-NEXT: movq %rcx, %r10
+; AVX-NEXT: movq %rcx, %r11
+; AVX-NEXT: movq %rcx, %r14
+; AVX-NEXT: movq %rcx, %r15
+; AVX-NEXT: movq %rdx, %r12
+; AVX-NEXT: movq %rdx, %r13
+; AVX-NEXT: movq %rdx, %rdi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: movq %rdx, %rsi
+; AVX-NEXT: movq %rdx, %rbx
+; AVX-NEXT: movq %rdx, %rbp
+; AVX-NEXT: andb $15, %dl
+; AVX-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq %rcx, %rdx
+; AVX-NEXT: andb $15, %cl
+; AVX-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $56, %rbp
+; AVX-NEXT: andb $15, %bpl
+; AVX-NEXT: movb %bpl, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $48, %rbx
+; AVX-NEXT: andb $15, %bl
+; AVX-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $40, %rsi
+; AVX-NEXT: andb $15, %sil
+; AVX-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $32, %rax
+; AVX-NEXT: andb $15, %al
+; AVX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $24, %rdi
+; AVX-NEXT: andb $15, %dil
+; AVX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $16, %r13
+; AVX-NEXT: andb $15, %r13b
+; AVX-NEXT: movb %r13b, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $8, %r12
+; AVX-NEXT: andb $15, %r12b
+; AVX-NEXT: movb %r12b, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $56, %rdx
+; AVX-NEXT: andb $15, %dl
+; AVX-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $48, %r15
+; AVX-NEXT: andb $15, %r15b
+; AVX-NEXT: movb %r15b, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $40, %r14
+; AVX-NEXT: andb $15, %r14b
+; AVX-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $32, %r11
+; AVX-NEXT: andb $15, %r11b
+; AVX-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $24, %r10
+; AVX-NEXT: andb $15, %r10b
+; AVX-NEXT: movb %r10b, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $16, %r9
+; AVX-NEXT: andb $15, %r9b
+; AVX-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: shrq $8, %r8
+; AVX-NEXT: andb $15, %r8b
+; AVX-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movb $0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: popq %r12
+; AVX-NEXT: popq %r13
+; AVX-NEXT: popq %r14
+; AVX-NEXT: popq %r15
+; AVX-NEXT: popq %rbp
+; AVX-NEXT: retq
+ %x4 = bitcast <16 x i8> %0 to <32 x i4>
+ %r0 = insertelement <32 x i4> %x4, i4 zeroinitializer, i32 1
+ %r1 = insertelement <32 x i4> %r0, i4 zeroinitializer, i32 3
+ %r2 = insertelement <32 x i4> %r1, i4 zeroinitializer, i32 5
+ %r3 = insertelement <32 x i4> %r2, i4 zeroinitializer, i32 7
+ %r4 = insertelement <32 x i4> %r3, i4 zeroinitializer, i32 9
+ %r5 = insertelement <32 x i4> %r4, i4 zeroinitializer, i32 11
+ %r6 = insertelement <32 x i4> %r5, i4 zeroinitializer, i32 13
+ %r7 = insertelement <32 x i4> %r6, i4 zeroinitializer, i32 15
+ %r8 = insertelement <32 x i4> %r7, i4 zeroinitializer, i32 17
+ %r9 = insertelement <32 x i4> %r8, i4 zeroinitializer, i32 19
+ %r10 = insertelement <32 x i4> %r9, i4 zeroinitializer, i32 21
+ %r11 = insertelement <32 x i4> %r10, i4 zeroinitializer, i32 23
+ %r12 = insertelement <32 x i4> %r11, i4 zeroinitializer, i32 25
+ %r13 = insertelement <32 x i4> %r12, i4 zeroinitializer, i32 27
+ %r14 = insertelement <32 x i4> %r13, i4 zeroinitializer, i32 29
+ %r15 = insertelement <32 x i4> %r14, i4 zeroinitializer, i32 31
+ %r = bitcast <32 x i4> %r15 to <16 x i8>
+ ret <16 x i8> %r
+}
+
+define <2 x i64> @_clearupper2xi64c(<2 x i64>) nounwind {
+; SSE-LABEL: _clearupper2xi64c:
+; SSE: # BB#0:
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: _clearupper2xi64c:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _clearupper2xi64c:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: retq
+ %r = and <2 x i64> <i64 4294967295, i64 4294967295>, %0
+ ret <2 x i64> %r
+}
+
+define <4 x i32> @_clearupper4xi32c(<4 x i32>) nounwind {
+; SSE-LABEL: _clearupper4xi32c:
+; SSE: # BB#0:
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _clearupper4xi32c:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX-NEXT: retq
+ %r = and <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>, %0
+ ret <4 x i32> %r
+}
+
+define <8 x i16> @_clearupper8xi16c(<8 x i16>) nounwind {
+; SSE-LABEL: _clearupper8xi16c:
+; SSE: # BB#0:
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _clearupper8xi16c:
+; AVX: # BB#0:
+; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %r = and <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>, %0
+ ret <8 x i16> %r
+}
+
+define <16 x i8> @_clearupper16xi8c(<16 x i8>) nounwind {
+; SSE-LABEL: _clearupper16xi8c:
+; SSE: # BB#0:
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _clearupper16xi8c:
+; AVX: # BB#0:
+; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %r = and <16 x i8> <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>, %0
+ ret <16 x i8> %r
+}
diff --git a/test/CodeGen/X86/clz.ll b/test/CodeGen/X86/clz.ll
index 4a094480c931..685b2588bf52 100644
--- a/test/CodeGen/X86/clz.ll
+++ b/test/CodeGen/X86/clz.ll
@@ -1,160 +1,753 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi,+lzcnt | FileCheck %s --check-prefix=CHECK --check-prefix=X32-CLZ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+lzcnt | FileCheck %s --check-prefix=CHECK --check-prefix=X64-CLZ
declare i8 @llvm.cttz.i8(i8, i1)
declare i16 @llvm.cttz.i16(i16, i1)
declare i32 @llvm.cttz.i32(i32, i1)
declare i64 @llvm.cttz.i64(i64, i1)
+
declare i8 @llvm.ctlz.i8(i8, i1)
declare i16 @llvm.ctlz.i16(i16, i1)
declare i32 @llvm.ctlz.i32(i32, i1)
declare i64 @llvm.ctlz.i64(i64, i1)
define i8 @cttz_i8(i8 %x) {
-; CHECK-LABEL: cttz_i8:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: bsfl %eax, %eax
-; CHECK-NEXT: retq
+; X32-LABEL: cttz_i8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: bsfl %eax, %eax
+; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: retl
+;
+; X64-LABEL: cttz_i8:
+; X64: # BB#0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: bsfl %eax, %eax
+; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: cttz_i8:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: tzcntl %eax, %eax
+; X32-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: cttz_i8:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: movzbl %dil, %eax
+; X64-CLZ-NEXT: tzcntl %eax, %eax
+; X64-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-CLZ-NEXT: retq
%tmp = call i8 @llvm.cttz.i8( i8 %x, i1 true )
ret i8 %tmp
}
define i16 @cttz_i16(i16 %x) {
-; CHECK-LABEL: cttz_i16:
-; CHECK: # BB#0:
-; CHECK-NEXT: bsfw %di, %ax
-; CHECK-NEXT: retq
+; X32-LABEL: cttz_i16:
+; X32: # BB#0:
+; X32-NEXT: bsfw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: retl
+;
+; X64-LABEL: cttz_i16:
+; X64: # BB#0:
+; X64-NEXT: bsfw %di, %ax
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: cttz_i16:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: tzcntw {{[0-9]+}}(%esp), %ax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: cttz_i16:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: tzcntw %di, %ax
+; X64-CLZ-NEXT: retq
%tmp = call i16 @llvm.cttz.i16( i16 %x, i1 true )
ret i16 %tmp
}
define i32 @cttz_i32(i32 %x) {
-; CHECK-LABEL: cttz_i32:
-; CHECK: # BB#0:
-; CHECK-NEXT: bsfl %edi, %eax
-; CHECK-NEXT: retq
+; X32-LABEL: cttz_i32:
+; X32: # BB#0:
+; X32-NEXT: bsfl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: cttz_i32:
+; X64: # BB#0:
+; X64-NEXT: bsfl %edi, %eax
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: cttz_i32:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: cttz_i32:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: tzcntl %edi, %eax
+; X64-CLZ-NEXT: retq
%tmp = call i32 @llvm.cttz.i32( i32 %x, i1 true )
ret i32 %tmp
}
define i64 @cttz_i64(i64 %x) {
-; CHECK-LABEL: cttz_i64:
-; CHECK: # BB#0:
-; CHECK-NEXT: bsfq %rdi, %rax
-; CHECK-NEXT: retq
+; X32-LABEL: cttz_i64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testl %eax, %eax
+; X32-NEXT: jne .LBB3_1
+; X32-NEXT: # BB#2:
+; X32-NEXT: bsfl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: addl $32, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+; X32-NEXT: .LBB3_1:
+; X32-NEXT: bsfl %eax, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+;
+; X64-LABEL: cttz_i64:
+; X64: # BB#0:
+; X64-NEXT: bsfq %rdi, %rax
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: cttz_i64:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: testl %eax, %eax
+; X32-CLZ-NEXT: jne .LBB3_1
+; X32-CLZ-NEXT: # BB#2:
+; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: addl $32, %eax
+; X32-CLZ-NEXT: xorl %edx, %edx
+; X32-CLZ-NEXT: retl
+; X32-CLZ-NEXT: .LBB3_1:
+; X32-CLZ-NEXT: tzcntl %eax, %eax
+; X32-CLZ-NEXT: xorl %edx, %edx
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: cttz_i64:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: tzcntq %rdi, %rax
+; X64-CLZ-NEXT: retq
%tmp = call i64 @llvm.cttz.i64( i64 %x, i1 true )
ret i64 %tmp
}
define i8 @ctlz_i8(i8 %x) {
-; CHECK-LABEL: ctlz_i8:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: bsrl %eax, %eax
-; CHECK-NEXT: xorl $7, %eax
-; CHECK-NEXT: retq
+; X32-LABEL: ctlz_i8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: bsrl %eax, %eax
+; X32-NEXT: xorl $7, %eax
+; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_i8:
+; X64: # BB#0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: bsrl %eax, %eax
+; X64-NEXT: xorl $7, %eax
+; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_i8:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: lzcntl %eax, %eax
+; X32-CLZ-NEXT: addl $-24, %eax
+; X32-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_i8:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: movzbl %dil, %eax
+; X64-CLZ-NEXT: lzcntl %eax, %eax
+; X64-CLZ-NEXT: addl $-24, %eax
+; X64-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-CLZ-NEXT: retq
%tmp2 = call i8 @llvm.ctlz.i8( i8 %x, i1 true )
ret i8 %tmp2
}
define i16 @ctlz_i16(i16 %x) {
-; CHECK-LABEL: ctlz_i16:
-; CHECK: # BB#0:
-; CHECK-NEXT: bsrw %di, %ax
-; CHECK-NEXT: xorl $15, %eax
-; CHECK-NEXT: retq
+; X32-LABEL: ctlz_i16:
+; X32: # BB#0:
+; X32-NEXT: bsrw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: xorl $15, %eax
+; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_i16:
+; X64: # BB#0:
+; X64-NEXT: bsrw %di, %ax
+; X64-NEXT: xorl $15, %eax
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_i16:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: lzcntw {{[0-9]+}}(%esp), %ax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_i16:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: lzcntw %di, %ax
+; X64-CLZ-NEXT: retq
%tmp2 = call i16 @llvm.ctlz.i16( i16 %x, i1 true )
ret i16 %tmp2
}
define i32 @ctlz_i32(i32 %x) {
-; CHECK-LABEL: ctlz_i32:
-; CHECK: # BB#0:
-; CHECK-NEXT: bsrl %edi, %eax
-; CHECK-NEXT: xorl $31, %eax
-; CHECK-NEXT: retq
+; X32-LABEL: ctlz_i32:
+; X32: # BB#0:
+; X32-NEXT: bsrl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl $31, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_i32:
+; X64: # BB#0:
+; X64-NEXT: bsrl %edi, %eax
+; X64-NEXT: xorl $31, %eax
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_i32:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_i32:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: lzcntl %edi, %eax
+; X64-CLZ-NEXT: retq
%tmp = call i32 @llvm.ctlz.i32( i32 %x, i1 true )
ret i32 %tmp
}
define i64 @ctlz_i64(i64 %x) {
-; CHECK-LABEL: ctlz_i64:
-; CHECK: # BB#0:
-; CHECK-NEXT: bsrq %rdi, %rax
-; CHECK-NEXT: xorq $63, %rax
-; CHECK-NEXT: retq
+; X32-LABEL: ctlz_i64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testl %eax, %eax
+; X32-NEXT: jne .LBB7_1
+; X32-NEXT: # BB#2:
+; X32-NEXT: bsrl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl $31, %eax
+; X32-NEXT: addl $32, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+; X32-NEXT: .LBB7_1:
+; X32-NEXT: bsrl %eax, %eax
+; X32-NEXT: xorl $31, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_i64:
+; X64: # BB#0:
+; X64-NEXT: bsrq %rdi, %rax
+; X64-NEXT: xorq $63, %rax
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_i64:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: testl %eax, %eax
+; X32-CLZ-NEXT: jne .LBB7_1
+; X32-CLZ-NEXT: # BB#2:
+; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: addl $32, %eax
+; X32-CLZ-NEXT: xorl %edx, %edx
+; X32-CLZ-NEXT: retl
+; X32-CLZ-NEXT: .LBB7_1:
+; X32-CLZ-NEXT: lzcntl %eax, %eax
+; X32-CLZ-NEXT: xorl %edx, %edx
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_i64:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: lzcntq %rdi, %rax
+; X64-CLZ-NEXT: retq
%tmp = call i64 @llvm.ctlz.i64( i64 %x, i1 true )
ret i64 %tmp
}
-define i32 @ctlz_i32_zero_test(i32 %n) {
; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+define i8 @ctlz_i8_zero_test(i8 %n) {
+; X32-LABEL: ctlz_i8_zero_test:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: movb $8, %al
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: je .LBB8_2
+; X32-NEXT: # BB#1: # %cond.false
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: bsrl %eax, %eax
+; X32-NEXT: xorl $7, %eax
+; X32-NEXT: .LBB8_2: # %cond.end
+; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_i8_zero_test:
+; X64: # BB#0:
+; X64-NEXT: movb $8, %al
+; X64-NEXT: testb %dil, %dil
+; X64-NEXT: je .LBB8_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: bsrl %eax, %eax
+; X64-NEXT: xorl $7, %eax
+; X64-NEXT: .LBB8_2: # %cond.end
+; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_i8_zero_test:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: lzcntl %eax, %eax
+; X32-CLZ-NEXT: addl $-24, %eax
+; X32-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_i8_zero_test:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: movzbl %dil, %eax
+; X64-CLZ-NEXT: lzcntl %eax, %eax
+; X64-CLZ-NEXT: addl $-24, %eax
+; X64-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-CLZ-NEXT: retq
+ %tmp1 = call i8 @llvm.ctlz.i8(i8 %n, i1 false)
+ ret i8 %tmp1
+}
-; CHECK-LABEL: ctlz_i32_zero_test:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl $32, %eax
-; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: je .LBB8_2
-; CHECK-NEXT: # BB#1: # %cond.false
-; CHECK-NEXT: bsrl %edi, %eax
-; CHECK-NEXT: xorl $31, %eax
-; CHECK-NEXT: .LBB8_2: # %cond.end
-; CHECK-NEXT: retq
+; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+define i16 @ctlz_i16_zero_test(i16 %n) {
+; X32-LABEL: ctlz_i16_zero_test:
+; X32: # BB#0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movw $16, %ax
+; X32-NEXT: testw %cx, %cx
+; X32-NEXT: je .LBB9_2
+; X32-NEXT: # BB#1: # %cond.false
+; X32-NEXT: bsrw %cx, %ax
+; X32-NEXT: xorl $15, %eax
+; X32-NEXT: .LBB9_2: # %cond.end
+; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_i16_zero_test:
+; X64: # BB#0:
+; X64-NEXT: movw $16, %ax
+; X64-NEXT: testw %di, %di
+; X64-NEXT: je .LBB9_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: bsrw %di, %ax
+; X64-NEXT: xorl $15, %eax
+; X64-NEXT: .LBB9_2: # %cond.end
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_i16_zero_test:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: lzcntw {{[0-9]+}}(%esp), %ax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_i16_zero_test:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: lzcntw %di, %ax
+; X64-CLZ-NEXT: retq
+ %tmp1 = call i16 @llvm.ctlz.i16(i16 %n, i1 false)
+ ret i16 %tmp1
+}
+
+; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+define i32 @ctlz_i32_zero_test(i32 %n) {
+; X32-LABEL: ctlz_i32_zero_test:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl $32, %eax
+; X32-NEXT: testl %ecx, %ecx
+; X32-NEXT: je .LBB10_2
+; X32-NEXT: # BB#1: # %cond.false
+; X32-NEXT: bsrl %ecx, %eax
+; X32-NEXT: xorl $31, %eax
+; X32-NEXT: .LBB10_2: # %cond.end
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_i32_zero_test:
+; X64: # BB#0:
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: testl %edi, %edi
+; X64-NEXT: je .LBB10_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: bsrl %edi, %eax
+; X64-NEXT: xorl $31, %eax
+; X64-NEXT: .LBB10_2: # %cond.end
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_i32_zero_test:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_i32_zero_test:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: lzcntl %edi, %eax
+; X64-CLZ-NEXT: retq
%tmp1 = call i32 @llvm.ctlz.i32(i32 %n, i1 false)
ret i32 %tmp1
}
-define i32 @ctlz_i32_fold_cmov(i32 %n) {
+; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+define i64 @ctlz_i64_zero_test(i64 %n) {
+; X32-LABEL: ctlz_i64_zero_test:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: bsrl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl $63, %eax
+; X32-NEXT: je .LBB11_2
+; X32-NEXT: # BB#1:
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: .LBB11_2:
+; X32-NEXT: testl %ecx, %ecx
+; X32-NEXT: jne .LBB11_3
+; X32-NEXT: # BB#4:
+; X32-NEXT: xorl $31, %eax
+; X32-NEXT: addl $32, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+; X32-NEXT: .LBB11_3:
+; X32-NEXT: bsrl %ecx, %eax
+; X32-NEXT: xorl $31, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_i64_zero_test:
+; X64: # BB#0:
+; X64-NEXT: movl $64, %eax
+; X64-NEXT: testq %rdi, %rdi
+; X64-NEXT: je .LBB11_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: bsrq %rdi, %rax
+; X64-NEXT: xorq $63, %rax
+; X64-NEXT: .LBB11_2: # %cond.end
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_i64_zero_test:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: testl %eax, %eax
+; X32-CLZ-NEXT: jne .LBB11_1
+; X32-CLZ-NEXT: # BB#2:
+; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: addl $32, %eax
+; X32-CLZ-NEXT: xorl %edx, %edx
+; X32-CLZ-NEXT: retl
+; X32-CLZ-NEXT: .LBB11_1:
+; X32-CLZ-NEXT: lzcntl %eax, %eax
+; X32-CLZ-NEXT: xorl %edx, %edx
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_i64_zero_test:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: lzcntq %rdi, %rax
+; X64-CLZ-NEXT: retq
+ %tmp1 = call i64 @llvm.ctlz.i64(i64 %n, i1 false)
+ ret i64 %tmp1
+}
+
+; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+define i8 @cttz_i8_zero_test(i8 %n) {
+; X32-LABEL: cttz_i8_zero_test:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: movb $8, %al
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: je .LBB12_2
+; X32-NEXT: # BB#1: # %cond.false
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: bsfl %eax, %eax
+; X32-NEXT: .LBB12_2: # %cond.end
+; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: retl
+;
+; X64-LABEL: cttz_i8_zero_test:
+; X64: # BB#0:
+; X64-NEXT: movb $8, %al
+; X64-NEXT: testb %dil, %dil
+; X64-NEXT: je .LBB12_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: bsfl %eax, %eax
+; X64-NEXT: .LBB12_2: # %cond.end
+; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: cttz_i8_zero_test:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: orl $256, %eax # imm = 0x100
+; X32-CLZ-NEXT: tzcntl %eax, %eax
+; X32-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: cttz_i8_zero_test:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: movzbl %dil, %eax
+; X64-CLZ-NEXT: orl $256, %eax # imm = 0x100
+; X64-CLZ-NEXT: tzcntl %eax, %eax
+; X64-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-CLZ-NEXT: retq
+ %tmp1 = call i8 @llvm.cttz.i8(i8 %n, i1 false)
+ ret i8 %tmp1
+}
+
+; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+define i16 @cttz_i16_zero_test(i16 %n) {
+; X32-LABEL: cttz_i16_zero_test:
+; X32: # BB#0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movw $16, %ax
+; X32-NEXT: testw %cx, %cx
+; X32-NEXT: je .LBB13_2
+; X32-NEXT: # BB#1: # %cond.false
+; X32-NEXT: bsfw %cx, %ax
+; X32-NEXT: .LBB13_2: # %cond.end
+; X32-NEXT: retl
+;
+; X64-LABEL: cttz_i16_zero_test:
+; X64: # BB#0:
+; X64-NEXT: movw $16, %ax
+; X64-NEXT: testw %di, %di
+; X64-NEXT: je .LBB13_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: bsfw %di, %ax
+; X64-NEXT: .LBB13_2: # %cond.end
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: cttz_i16_zero_test:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: tzcntw {{[0-9]+}}(%esp), %ax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: cttz_i16_zero_test:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: tzcntw %di, %ax
+; X64-CLZ-NEXT: retq
+ %tmp1 = call i16 @llvm.cttz.i16(i16 %n, i1 false)
+ ret i16 %tmp1
+}
+
+; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+define i32 @cttz_i32_zero_test(i32 %n) {
+; X32-LABEL: cttz_i32_zero_test:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl $32, %eax
+; X32-NEXT: testl %ecx, %ecx
+; X32-NEXT: je .LBB14_2
+; X32-NEXT: # BB#1: # %cond.false
+; X32-NEXT: bsfl %ecx, %eax
+; X32-NEXT: .LBB14_2: # %cond.end
+; X32-NEXT: retl
+;
+; X64-LABEL: cttz_i32_zero_test:
+; X64: # BB#0:
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: testl %edi, %edi
+; X64-NEXT: je .LBB14_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: bsfl %edi, %eax
+; X64-NEXT: .LBB14_2: # %cond.end
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: cttz_i32_zero_test:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: cttz_i32_zero_test:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: tzcntl %edi, %eax
+; X64-CLZ-NEXT: retq
+ %tmp1 = call i32 @llvm.cttz.i32(i32 %n, i1 false)
+ ret i32 %tmp1
+}
+
+; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+define i64 @cttz_i64_zero_test(i64 %n) {
+; X32-LABEL: cttz_i64_zero_test:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: bsfl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl $32, %eax
+; X32-NEXT: je .LBB15_2
+; X32-NEXT: # BB#1:
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: .LBB15_2:
+; X32-NEXT: testl %ecx, %ecx
+; X32-NEXT: jne .LBB15_3
+; X32-NEXT: # BB#4:
+; X32-NEXT: addl $32, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+; X32-NEXT: .LBB15_3:
+; X32-NEXT: bsfl %ecx, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+;
+; X64-LABEL: cttz_i64_zero_test:
+; X64: # BB#0:
+; X64-NEXT: movl $64, %eax
+; X64-NEXT: testq %rdi, %rdi
+; X64-NEXT: je .LBB15_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: bsfq %rdi, %rax
+; X64-NEXT: .LBB15_2: # %cond.end
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: cttz_i64_zero_test:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: testl %eax, %eax
+; X32-CLZ-NEXT: jne .LBB15_1
+; X32-CLZ-NEXT: # BB#2:
+; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: addl $32, %eax
+; X32-CLZ-NEXT: xorl %edx, %edx
+; X32-CLZ-NEXT: retl
+; X32-CLZ-NEXT: .LBB15_1:
+; X32-CLZ-NEXT: tzcntl %eax, %eax
+; X32-CLZ-NEXT: xorl %edx, %edx
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: cttz_i64_zero_test:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: tzcntq %rdi, %rax
+; X64-CLZ-NEXT: retq
+ %tmp1 = call i64 @llvm.cttz.i64(i64 %n, i1 false)
+ ret i64 %tmp1
+}
+
; Don't generate the cmovne when the source is known non-zero (and bsr would
; not set ZF).
; rdar://9490949
; FIXME: The compare and branch are produced late in IR (by CodeGenPrepare), and
; codegen doesn't know how to delete the movl and je.
-
-; CHECK-LABEL: ctlz_i32_fold_cmov:
-; CHECK: # BB#0:
-; CHECK-NEXT: orl $1, %edi
-; CHECK-NEXT: movl $32, %eax
-; CHECK-NEXT: je .LBB9_2
-; CHECK-NEXT: # BB#1: # %cond.false
-; CHECK-NEXT: bsrl %edi, %eax
-; CHECK-NEXT: xorl $31, %eax
-; CHECK-NEXT: .LBB9_2: # %cond.end
-; CHECK-NEXT: retq
+define i32 @ctlz_i32_fold_cmov(i32 %n) {
+; X32-LABEL: ctlz_i32_fold_cmov:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: orl $1, %ecx
+; X32-NEXT: movl $32, %eax
+; X32-NEXT: je .LBB16_2
+; X32-NEXT: # BB#1: # %cond.false
+; X32-NEXT: bsrl %ecx, %eax
+; X32-NEXT: xorl $31, %eax
+; X32-NEXT: .LBB16_2: # %cond.end
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_i32_fold_cmov:
+; X64: # BB#0:
+; X64-NEXT: orl $1, %edi
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: je .LBB16_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: bsrl %edi, %eax
+; X64-NEXT: xorl $31, %eax
+; X64-NEXT: .LBB16_2: # %cond.end
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_i32_fold_cmov:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: orl $1, %eax
+; X32-CLZ-NEXT: lzcntl %eax, %eax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_i32_fold_cmov:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: orl $1, %edi
+; X64-CLZ-NEXT: lzcntl %edi, %eax
+; X64-CLZ-NEXT: retq
%or = or i32 %n, 1
%tmp1 = call i32 @llvm.ctlz.i32(i32 %or, i1 false)
ret i32 %tmp1
}
-define i32 @ctlz_bsr(i32 %n) {
; Don't generate any xors when a 'ctlz' intrinsic is actually used to compute
; the most significant bit, which is what 'bsr' does natively.
-
-; CHECK-LABEL: ctlz_bsr:
-; CHECK: # BB#0:
-; CHECK-NEXT: bsrl %edi, %eax
-; CHECK-NEXT: retq
+; FIXME: We should probably select BSR instead of LZCNT in these circumstances.
+define i32 @ctlz_bsr(i32 %n) {
+; X32-LABEL: ctlz_bsr:
+; X32: # BB#0:
+; X32-NEXT: bsrl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_bsr:
+; X64: # BB#0:
+; X64-NEXT: bsrl %edi, %eax
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_bsr:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: xorl $31, %eax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_bsr:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: lzcntl %edi, %eax
+; X64-CLZ-NEXT: xorl $31, %eax
+; X64-CLZ-NEXT: retq
%ctlz = call i32 @llvm.ctlz.i32(i32 %n, i1 true)
%bsr = xor i32 %ctlz, 31
ret i32 %bsr
}
-define i32 @ctlz_bsr_zero_test(i32 %n) {
; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
; FIXME: The compare and branch are produced late in IR (by CodeGenPrepare), and
; codegen doesn't know how to combine the $32 and $31 into $63.
-
-; CHECK-LABEL: ctlz_bsr_zero_test:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl $32, %eax
-; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: je .LBB11_2
-; CHECK-NEXT: # BB#1: # %cond.false
-; CHECK-NEXT: bsrl %edi, %eax
-; CHECK-NEXT: xorl $31, %eax
-; CHECK-NEXT: .LBB11_2: # %cond.end
-; CHECK-NEXT: xorl $31, %eax
-; CHECK-NEXT: retq
+define i32 @ctlz_bsr_zero_test(i32 %n) {
+; X32-LABEL: ctlz_bsr_zero_test:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl $32, %eax
+; X32-NEXT: testl %ecx, %ecx
+; X32-NEXT: je .LBB18_2
+; X32-NEXT: # BB#1: # %cond.false
+; X32-NEXT: bsrl %ecx, %eax
+; X32-NEXT: xorl $31, %eax
+; X32-NEXT: .LBB18_2: # %cond.end
+; X32-NEXT: xorl $31, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: ctlz_bsr_zero_test:
+; X64: # BB#0:
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: testl %edi, %edi
+; X64-NEXT: je .LBB18_2
+; X64-NEXT: # BB#1: # %cond.false
+; X64-NEXT: bsrl %edi, %eax
+; X64-NEXT: xorl $31, %eax
+; X64-NEXT: .LBB18_2: # %cond.end
+; X64-NEXT: xorl $31, %eax
+; X64-NEXT: retq
+;
+; X32-CLZ-LABEL: ctlz_bsr_zero_test:
+; X32-CLZ: # BB#0:
+; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
+; X32-CLZ-NEXT: xorl $31, %eax
+; X32-CLZ-NEXT: retl
+;
+; X64-CLZ-LABEL: ctlz_bsr_zero_test:
+; X64-CLZ: # BB#0:
+; X64-CLZ-NEXT: lzcntl %edi, %eax
+; X64-CLZ-NEXT: xorl $31, %eax
+; X64-CLZ-NEXT: retq
%ctlz = call i32 @llvm.ctlz.i32(i32 %n, i1 false)
%bsr = xor i32 %ctlz, 31
ret i32 %bsr
diff --git a/test/CodeGen/X86/cmov-into-branch.ll b/test/CodeGen/X86/cmov-into-branch.ll
index 909440800a56..acb5a2bb51f1 100644
--- a/test/CodeGen/X86/cmov-into-branch.ll
+++ b/test/CodeGen/X86/cmov-into-branch.ll
@@ -1,63 +1,135 @@
-; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
-; cmp with single-use load, should not form cmov.
+; cmp with single-use load, should not form branch.
define i32 @test1(double %a, double* nocapture %b, i32 %x, i32 %y) {
+; CHECK-LABEL: test1:
+; CHECK: # BB#0:
+; CHECK-NEXT: ucomisd (%rdi), %xmm0
+; CHECK-NEXT: cmovbel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+;
%load = load double, double* %b, align 8
%cmp = fcmp olt double %load, %a
%cond = select i1 %cmp, i32 %x, i32 %y
ret i32 %cond
-; CHECK-LABEL: test1:
-; CHECK: ucomisd
-; CHECK-NOT: cmov
-; CHECK: j
-; CHECK-NOT: cmov
}
; Sanity check: no load.
define i32 @test2(double %a, double %b, i32 %x, i32 %y) {
+; CHECK-LABEL: test2:
+; CHECK: # BB#0:
+; CHECK-NEXT: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: cmovbel %esi, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+;
%cmp = fcmp ogt double %a, %b
%cond = select i1 %cmp, i32 %x, i32 %y
ret i32 %cond
-; CHECK-LABEL: test2:
-; CHECK: ucomisd
-; CHECK: cmov
-}
-
-; Multiple uses of %a, should not form cmov.
-define i32 @test3(i32 %a, i32* nocapture %b, i32 %x) {
- %load = load i32, i32* %b, align 4
- %cmp = icmp ult i32 %load, %a
- %cond = select i1 %cmp, i32 %a, i32 %x
- ret i32 %cond
-; CHECK-LABEL: test3:
-; CHECK: cmpl
-; CHECK-NOT: cmov
-; CHECK: j
-; CHECK-NOT: cmov
}
; Multiple uses of the load.
define i32 @test4(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
+; CHECK-LABEL: test4:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl (%rsi), %eax
+; CHECK-NEXT: cmpl %edi, %eax
+; CHECK-NEXT: cmovael %ecx, %edx
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: retq
+;
%load = load i32, i32* %b, align 4
%cmp = icmp ult i32 %load, %a
%cond = select i1 %cmp, i32 %x, i32 %y
%add = add i32 %cond, %load
ret i32 %add
-; CHECK-LABEL: test4:
-; CHECK: cmpl
-; CHECK: cmov
}
; Multiple uses of the cmp.
define i32 @test5(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
+; CHECK-LABEL: test5:
+; CHECK: # BB#0:
+; CHECK-NEXT: cmpl %edi, (%rsi)
+; CHECK-NEXT: cmoval %edi, %ecx
+; CHECK-NEXT: cmovael %edx, %ecx
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retq
+;
%load = load i32, i32* %b, align 4
%cmp = icmp ult i32 %load, %a
%cmp1 = icmp ugt i32 %load, %a
%cond = select i1 %cmp1, i32 %a, i32 %y
%cond5 = select i1 %cmp, i32 %cond, i32 %x
ret i32 %cond5
-; CHECK-LABEL: test5:
-; CHECK: cmpl
-; CHECK: cmov
-; CHECK: cmov
}
+
+; If a select is not obviously predictable, don't turn it into a branch.
+define i32 @weighted_select1(i32 %a, i32 %b) {
+; CHECK-LABEL: weighted_select1:
+; CHECK: # BB#0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: cmovnel %edi, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+;
+ %cmp = icmp ne i32 %a, 0
+ %sel = select i1 %cmp, i32 %a, i32 %b, !prof !0
+ ret i32 %sel
+}
+
+; If a select is obviously predictable, turn it into a branch.
+define i32 @weighted_select2(i32 %a, i32 %b) {
+; CHECK-LABEL: weighted_select2:
+; CHECK: # BB#0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: jne [[LABEL_BB5:.*]]
+; CHECK: movl %esi, %edi
+; CHECK-NEXT: [[LABEL_BB5]]
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+;
+ %cmp = icmp ne i32 %a, 0
+ %sel = select i1 %cmp, i32 %a, i32 %b, !prof !1
+ ret i32 %sel
+}
+
+; Note the reversed profile weights: it doesn't matter if it's
+; obviously true or obviously false.
+; Either one should become a branch rather than conditional move.
+; TODO: But likely true vs. likely false should affect basic block placement?
+define i32 @weighted_select3(i32 %a, i32 %b) {
+; CHECK-LABEL: weighted_select3:
+; CHECK: # BB#0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: jne [[LABEL_BB6:.*]]
+; CHECK: movl %esi, %edi
+; CHECK-NEXT: [[LABEL_BB6]]
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+;
+ %cmp = icmp ne i32 %a, 0
+ %sel = select i1 %cmp, i32 %a, i32 %b, !prof !2
+ ret i32 %sel
+}
+
+; Weightlessness is no reason to die.
+define i32 @unweighted_select(i32 %a, i32 %b) {
+; CHECK-LABEL: unweighted_select:
+; CHECK: # BB#0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: cmovnel %edi, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+;
+ %cmp = icmp ne i32 %a, 0
+ %sel = select i1 %cmp, i32 %a, i32 %b, !prof !3
+ ret i32 %sel
+}
+
+!0 = !{!"branch_weights", i32 1, i32 99}
+!1 = !{!"branch_weights", i32 1, i32 100}
+!2 = !{!"branch_weights", i32 100, i32 1}
+!3 = !{!"branch_weights", i32 0, i32 0}
+
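; Illustrative sketch (assumed, not part of the imported test above): the !prof
; metadata on a select carries "branch_weights" for its true and false operands.
; Heavily skewed weights (as in weighted_select2/3) are what let codegen lower
; the select as a branch; balanced, missing, or zero weights keep the cmov.
define i32 @select_prof_sketch(i32 %a, i32 %b) {
  %c = icmp sgt i32 %a, %b
  %s = select i1 %c, i32 %a, i32 %b, !prof !9   ; true operand expected ~99% of the time
  ret i32 %s
}
!9 = !{!"branch_weights", i32 99, i32 1}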
diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll
index f2f36b15d0c5..9acc9ea4fb18 100644
--- a/test/CodeGen/X86/cmov.ll
+++ b/test/CodeGen/X86/cmov.ll
@@ -120,8 +120,8 @@ declare i32 @printf(i8* nocapture, ...) nounwind
define i32 @test5(i32* nocapture %P) nounwind readonly {
entry:
; CHECK-LABEL: test5:
+; CHECK: xorl %eax, %eax
; CHECK: setg %al
-; CHECK: movzbl %al, %eax
; CHECK: orl $-2, %eax
; CHECK: ret
@@ -134,8 +134,8 @@ entry:
define i32 @test6(i32* nocapture %P) nounwind readonly {
entry:
; CHECK-LABEL: test6:
+; CHECK: xorl %eax, %eax
; CHECK: setl %al
-; CHECK: movzbl %al, %eax
; CHECK: leal 4(%rax,%rax,8), %eax
; CHECK: ret
%0 = load i32, i32* %P, align 4 ; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/cmovcmov.ll b/test/CodeGen/X86/cmovcmov.ll
index 9363d31866d7..38ba308ecff5 100644
--- a/test/CodeGen/X86/cmovcmov.ll
+++ b/test/CodeGen/X86/cmovcmov.ll
@@ -250,14 +250,14 @@ attributes #0 = { nounwind }
; CMOV-DAG: movb $20, %al
; CMOV-DAG: movb $20, %dl
; CMOV: jl [[BB0:.LBB[0-9_]+]]
-; CMOV: movb %cl, %dl
+; CMOV: movl %ecx, %edx
; CMOV: [[BB0]]:
; CMOV: jg [[BB1:.LBB[0-9_]+]]
-; CMOV: movb %dl, %al
+; CMOV: movl %edx, %eax
; CMOV: [[BB1]]:
; CMOV: testl %edi, %edi
; CMOV: je [[BB2:.LBB[0-9_]+]]
-; CMOV: movb %dl, %al
+; CMOV: movl %edx, %eax
; CMOV: [[BB2]]:
; CMOV: movb %al, g8(%rip)
; CMOV: retq
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll
index eb9a29011428..d24f27ddf22c 100644
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -49,9 +49,9 @@ define i64 @test3(i64 %x) nounwind {
%r = zext i1 %t to i64
ret i64 %r
; CHECK-LABEL: test3:
+; CHECK: xorl %eax, %eax
; CHECK: testq %rdi, %rdi
; CHECK: sete %al
-; CHECK: movzbl %al, %eax
; CHECK: ret
}
@@ -60,9 +60,9 @@ define i64 @test4(i64 %x) nounwind {
%r = zext i1 %t to i64
ret i64 %r
; CHECK-LABEL: test4:
+; CHECK: xorl %eax, %eax
; CHECK: testq %rdi, %rdi
; CHECK: setle %al
-; CHECK: movzbl %al, %eax
; CHECK: ret
}
@@ -255,3 +255,30 @@ define zeroext i1 @test19(i32 %L) {
; CHECK: testl %edi, %edi
; CHECK: setns %al
}
+
+@d = global i8 0, align 1
+
+; This test failed due to incorrect handling of "shift + icmp" sequence
+define void @test20(i32 %bf.load, i8 %x1, i8* %b_addr) {
+ %bf.shl = shl i32 %bf.load, 8
+ %bf.ashr = ashr exact i32 %bf.shl, 8
+ %tobool4 = icmp ne i32 %bf.ashr, 0
+ %conv = zext i1 %tobool4 to i32
+ %conv6 = zext i8 %x1 to i32
+ %add = add nuw nsw i32 %conv, %conv6
+ %tobool7 = icmp ne i32 %add, 0
+ %frombool = zext i1 %tobool7 to i8
+ store i8 %frombool, i8* %b_addr, align 1
+ %tobool14 = icmp ne i32 %bf.shl, 0
+ %frombool15 = zext i1 %tobool14 to i8
+ store i8 %frombool15, i8* @d, align 1
+ ret void
+
+; CHECK-LABEL: test20
+; CHECK: andl
+; CHECK: setne
+; CHECK: addl
+; CHECK: setne
+; CHECK: testl
+; CHECK: setne
+} \ No newline at end of file
diff --git a/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/test/CodeGen/X86/cmpxchg-clobber-flags.ll
index 1665360e4990..f2b9dee91037 100644
--- a/test/CodeGen/X86/cmpxchg-clobber-flags.ll
+++ b/test/CodeGen/X86/cmpxchg-clobber-flags.ll
@@ -21,9 +21,11 @@ define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) {
; i386-NEXT: lahf
; i386-NEXT: movl %eax, [[FLAGS:%.*]]
; i386-NEXT: popl %eax
-; i386-NEXT: movl %edx, 4(%esp)
-; i386-NEXT: movl %eax, (%esp)
+; i386-NEXT: subl $8, %esp
+; i386-NEXT: pushl %edx
+; i386-NEXT: pushl %eax
; i386-NEXT: calll bar
+; i386-NEXT: addl $16, %esp
; i386-NEXT: movl [[FLAGS]], %eax
; i386-NEXT: addb $127, %al
; i386-NEXT: sahf
@@ -61,11 +63,10 @@ define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) {
; x8664-sahf-NEXT: popq %rax
; x8664-sahf-NEXT: movq %rax, %rdi
; x8664-sahf-NEXT: callq bar
-; x8664-sahf-NEXT: pushq %rax
+; RAX is dead, no need to push and pop it.
; x8664-sahf-NEXT: movq [[FLAGS]], %rax
; x8664-sahf-NEXT: addb $127, %al
; x8664-sahf-NEXT: sahf
-; x8664-sahf-NEXT: popq %rax
; x8664-sahf-NEXT: jne
%cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst
@@ -166,11 +167,10 @@ define i32 @test_feed_cmov(i32* %addr, i32 %desired, i32 %new) {
; x8664-sahf-LABEL: test_feed_cmov:
; x8664-sahf: cmpxchgl
-; x8664-sahf: pushq %rax
+; RAX is dead, do not push or pop it.
; x8664-sahf-NEXT: seto %al
; x8664-sahf-NEXT: lahf
; x8664-sahf-NEXT: movq %rax, [[FLAGS:%.*]]
-; x8664-sahf-NEXT: popq %rax
; x8664-sahf-NEXT: callq foo
; x8664-sahf-NEXT: pushq %rax
; x8664-sahf-NEXT: movq [[FLAGS]], %rax
diff --git a/test/CodeGen/X86/cmpxchg-i1.ll b/test/CodeGen/X86/cmpxchg-i1.ll
index 5f5869f78bba..97e4472b0890 100644
--- a/test/CodeGen/X86/cmpxchg-i1.ll
+++ b/test/CodeGen/X86/cmpxchg-i1.ll
@@ -34,7 +34,7 @@ define i64 @cmpxchg_sext(i32* %addr, i32 %desired, i32 %new) {
; CHECK-LABEL: cmpxchg_sext:
; CHECK-DAG: cmpxchgl
; CHECK-NOT: cmpl
-; CHECK: sete %al
+; CHECK: sete %cl
; CHECK: retq
%pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
%success = extractvalue { i32, i1 } %pair, 1
@@ -44,10 +44,10 @@ define i64 @cmpxchg_sext(i32* %addr, i32 %desired, i32 %new) {
define i32 @cmpxchg_zext(i32* %addr, i32 %desired, i32 %new) {
; CHECK-LABEL: cmpxchg_zext:
+; CHECK: xorl %e[[R:[a-z]]]x
; CHECK: cmpxchgl
; CHECK-NOT: cmp
-; CHECK: sete [[BYTE:%[a-z0-9]+]]
-; CHECK: movzbl [[BYTE]], %eax
+; CHECK: sete %[[R]]l
%pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
%success = extractvalue { i32, i1 } %pair, 1
%mask = zext i1 %success to i32
diff --git a/test/CodeGen/X86/cmpxchg-i128-i1.ll b/test/CodeGen/X86/cmpxchg-i128-i1.ll
index 278e6a4ed75e..1510b2a49c32 100644
--- a/test/CodeGen/X86/cmpxchg-i128-i1.ll
+++ b/test/CodeGen/X86/cmpxchg-i128-i1.ll
@@ -44,10 +44,10 @@ define i1 @cmpxchg_arithcmp(i128* %addr, i128 %desired, i128 %new) {
define i128 @cmpxchg_zext(i128* %addr, i128 %desired, i128 %new) {
; CHECK-LABEL: cmpxchg_zext:
+; CHECK: xorl
; CHECK: cmpxchg16b
; CHECK-NOT: cmpq
-; CHECK: sete [[BYTE:%[a-z0-9]+]]
-; CHECK: movzbl [[BYTE]], %eax
+; CHECK: sete
%pair = cmpxchg i128* %addr, i128 %desired, i128 %new seq_cst seq_cst
%success = extractvalue { i128, i1 } %pair, 1
%mask = zext i1 %success to i128
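; Illustrative sketch (assumed, not from the imported tests above): cmpxchg
; yields a { value, success } pair; the extracted i1 success flag, widened with
; zext, is what the updated checks expect to be materialized via xor + sete
; rather than sete + movzbl.
define i32 @cmpxchg_flag_sketch(i32* %p, i32 %old, i32 %new) {
  %pair = cmpxchg i32* %p, i32 %old, i32 %new seq_cst seq_cst
  %ok = extractvalue { i32, i1 } %pair, 1   ; i1: did the exchange succeed?
  %r = zext i1 %ok to i32
  ret i32 %r
}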
diff --git a/test/CodeGen/X86/coalescer-commute3.ll b/test/CodeGen/X86/coalescer-commute3.ll
index e5bd448a4158..9f22bf0e1a7a 100644
--- a/test/CodeGen/X86/coalescer-commute3.ll
+++ b/test/CodeGen/X86/coalescer-commute3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | grep mov | count 6
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 -no-x86-call-frame-opt | grep mov | count 6
%struct.quad_struct = type { i32, i32, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct* }
diff --git a/test/CodeGen/X86/code_placement_align_all.ll b/test/CodeGen/X86/code_placement_align_all.ll
index 53df90620204..11dc59a3bab9 100644
--- a/test/CodeGen/X86/code_placement_align_all.ll
+++ b/test/CodeGen/X86/code_placement_align_all.ll
@@ -1,9 +1,9 @@
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -align-all-blocks=16 < %s | FileCheck %s
;CHECK-LABEL: foo:
-;CHECK: .align 65536, 0x90
-;CHECK: .align 65536, 0x90
-;CHECK: .align 65536, 0x90
+;CHECK: .p2align 16, 0x90
+;CHECK: .p2align 16, 0x90
+;CHECK: .p2align 16, 0x90
;CHECK: ret
define i32 @foo(i32 %t, i32 %l) nounwind readnone ssp uwtable {
%1 = icmp eq i32 %t, 0
diff --git a/test/CodeGen/X86/code_placement_cold_loop_blocks.ll b/test/CodeGen/X86/code_placement_cold_loop_blocks.ll
index 592d1ce45bb6..d7dc8defac3a 100644
--- a/test/CodeGen/X86/code_placement_cold_loop_blocks.ll
+++ b/test/CodeGen/X86/code_placement_cold_loop_blocks.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s
define void @foo() !prof !1 {
; Test if a cold block in a loop will be placed at the end of the function
diff --git a/test/CodeGen/X86/code_placement_ignore_succ_in_inner_loop.ll b/test/CodeGen/X86/code_placement_ignore_succ_in_inner_loop.ll
index 79b4883fb1d6..b30aaea9024b 100644
--- a/test/CodeGen/X86/code_placement_ignore_succ_in_inner_loop.ll
+++ b/test/CodeGen/X86/code_placement_ignore_succ_in_inner_loop.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s
define void @foo() {
; Test that when determining the edge probability from a node in an inner loop
diff --git a/test/CodeGen/X86/code_placement_loop_rotation.ll b/test/CodeGen/X86/code_placement_loop_rotation.ll
index 3ec5961486e8..96fbc8138999 100644
--- a/test/CodeGen/X86/code_placement_loop_rotation.ll
+++ b/test/CodeGen/X86/code_placement_loop_rotation.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -precise-rotation-cost < %s | FileCheck %s -check-prefix=CHECK-PROFILE
define void @foo() {
diff --git a/test/CodeGen/X86/code_placement_loop_rotation2.ll b/test/CodeGen/X86/code_placement_loop_rotation2.ll
index 6d8b3c99cd05..ea95c5438e3b 100644
--- a/test/CodeGen/X86/code_placement_loop_rotation2.ll
+++ b/test/CodeGen/X86/code_placement_loop_rotation2.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -precise-rotation-cost < %s | FileCheck %s -check-prefix=CHECK-PROFILE
define void @foo() {
diff --git a/test/CodeGen/X86/code_placement_loop_rotation3.ll b/test/CodeGen/X86/code_placement_loop_rotation3.ll
new file mode 100644
index 000000000000..6a5b743ef8a1
--- /dev/null
+++ b/test/CodeGen/X86/code_placement_loop_rotation3.ll
@@ -0,0 +1,42 @@
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -force-precise-rotation-cost < %s | FileCheck %s -check-prefix=CHECK
+
+define void @bar() {
+; Test that all edges in the loop chain are fall through with profile data.
+;
+; CHECK-LABEL: bar:
+; CHECK: latch
+; CHECK: header
+; CHECK: if.then
+; CHECK: end
+
+entry:
+ br label %header
+
+header:
+ call void @e()
+ %call = call zeroext i1 @a()
+ br i1 %call, label %if.then, label %latch, !prof !1
+
+if.then:
+ call void @f()
+ %call3 = call zeroext i1 @a()
+ br i1 %call3, label %latch, label %end, !prof !2
+
+latch:
+ call void @h()
+ %call2 = call zeroext i1 @a()
+ br i1 %call2, label %header, label %end, !prof !3
+
+end:
+ ret void
+}
+
+declare zeroext i1 @a()
+declare void @e()
+declare void @f()
+declare void @g()
+declare void @h()
+
+!1 = !{!"branch_weights", i32 16, i32 16}
+!2 = !{!"branch_weights", i32 97, i32 3}
+!3 = !{!"branch_weights", i32 97, i32 3}
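; Illustrative sketch (assumed, not part of the new test above): on a
; conditional branch, branch_weights become edge probabilities as
; weight / sum-of-weights, so !2 and !3 above give 97/(97+3) = 97% to the first
; successor, which is why those edges are expected to be laid out as
; fall-throughs in the checked block order.
define void @br_prof_sketch(i1 %c) {
entry:
  br i1 %c, label %hot, label %cold, !prof !9
hot:                                            ; taken ~97% of the time under !9
  ret void
cold:                                           ; taken ~3% of the time under !9
  ret void
}
!9 = !{!"branch_weights", i32 97, i32 3}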
diff --git a/test/CodeGen/X86/code_placement_outline_optional_branches.ll b/test/CodeGen/X86/code_placement_outline_optional_branches.ll
index 3364915fd1b7..5624d435215a 100644
--- a/test/CodeGen/X86/code_placement_outline_optional_branches.ll
+++ b/test/CodeGen/X86/code_placement_outline_optional_branches.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -outline-optional-branches < %s | FileCheck %s -check-prefix=CHECK-OUTLINE
define void @foo(i32 %t1, i32 %t2, i32 %t3) {
diff --git a/test/CodeGen/X86/combine-multiplies.ll b/test/CodeGen/X86/combine-multiplies.ll
index 5e51edbf52f9..15528cd0714b 100644
--- a/test/CodeGen/X86/combine-multiplies.ll
+++ b/test/CodeGen/X86/combine-multiplies.ll
@@ -31,10 +31,10 @@
;
; CHECK-LABEL: testCombineMultiplies
; CHECK: imull $400, [[ARG1:%[a-z]+]], [[MUL:%[a-z]+]] # imm = 0x190
-; CHECK-NEXT: leal ([[MUL]],[[ARG2:%[a-z]+]]), [[LEA:%[a-z]+]]
+; CHECK-NEXT: leal ([[ARG2:%[a-z]+]],[[MUL]]), [[LEA:%[a-z]+]]
; CHECK-NEXT: movl $11, {{[0-9]+}}([[LEA]],[[ARG1]],4)
-; CHECK-NEXT: movl $22, {{[0-9]+}}([[MUL]],[[ARG2]])
-; CHECK-NEXT: movl $33, {{[0-9]+}}([[MUL]],[[ARG2]])
+; CHECK-NEXT: movl $22, {{[0-9]+}}([[ARG2]],[[MUL]])
+; CHECK-NEXT: movl $33, {{[0-9]+}}([[ARG2]],[[MUL]])
; CHECK: retl
;
@@ -109,7 +109,7 @@ entry:
; CHECK-NEXT: movdqa [[C242]], v2
; CHECK-NEXT: [[C726]], v3
; CHECK-NEXT: [[C11]], x
-; CHECK-NEXT: retl
+; CHECK-NEXT: retl
@v2 = common global <4 x i32> zeroinitializer, align 16
@v3 = common global <4 x i32> zeroinitializer, align 16
@@ -148,7 +148,7 @@ entry:
; CHECK-NEXT: movdqa [[C242]], v2
; CHECK-NEXT: [[C726]], v3
; CHECK-NEXT: [[C11]], x
-; CHECK-NEXT: retl
+; CHECK-NEXT: retl
; Function Attrs: nounwind
define void @testCombineMultiplies_non_splat(<4 x i32> %v1) {
entry:
diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll
index e17cfbeeee12..5cbd74980cab 100644
--- a/test/CodeGen/X86/combine-or.ll
+++ b/test/CodeGen/X86/combine-or.ll
@@ -356,3 +356,62 @@ define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
ret <4 x i8> %or
}
+; Verify that we can fold regardless of which operand is the zeroinitializer
+
+define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test2b:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32><i32 0, i32 0, i32 6, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test2c:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32><i32 0, i32 0, i32 6, i32 7>
+ %shuf2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %b, <4 x i32><i32 4, i32 5, i32 0, i32 0>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+
+define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test2d:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+ %shuf2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %b, <4 x i32><i32 4, i32 5, i32 0, i32 0>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+; Make sure we can have an undef where an index pointing to the zero vector should be
+
+define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test2e:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 undef, i32 4, i32 2, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+
+define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test2f:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 undef, i32 1, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
diff --git a/test/CodeGen/X86/combine-testm-and.ll b/test/CodeGen/X86/combine-testm-and.ll
new file mode 100644
index 000000000000..2b95a114540d
--- /dev/null
+++ b/test/CodeGen/X86/combine-testm-and.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+;RUN: llc -mtriple=x86_64-apple-darwin -mcpu=skx < %s | FileCheck %s
+
+define i32 @combineTESTM_AND_1(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: combineTESTM_AND_1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+ %and.i = and <8 x i64> %b, %a
+ %test.i = tail call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %and.i, <8 x i64> %and.i, i8 -1)
+ %conv = zext i8 %test.i to i32
+ ret i32 %conv
+}
+
+define i32 @combineTESTM_AND_2(<8 x i64> %a, <8 x i64> %b , i8 %mask) {
+; CHECK-LABEL: combineTESTM_AND_2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+ %and.i = and <8 x i64> %b, %a
+ %test.i = tail call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %and.i, <8 x i64> %and.i, i8 %mask)
+ %conv = zext i8 %test.i to i32
+ ret i32 %conv
+}
+
+define i32 @combineTESTM_AND_mask_3(<8 x i64> %a, <8 x i64>* %bptr , i8 %mask) {
+; CHECK-LABEL: combineTESTM_AND_mask_3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vptestmq (%rdi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+ %b = load <8 x i64>, <8 x i64>* %bptr
+ %and.i = and <8 x i64> %a, %b
+ %test.i = tail call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %and.i, <8 x i64> %and.i, i8 %mask)
+ %conv = zext i8 %test.i to i32
+ ret i32 %conv
+}
+
+define i32 @combineTESTM_AND_mask_4(<8 x i64> %a, <8 x i64>* %bptr , i8 %mask) {
+; CHECK-LABEL: combineTESTM_AND_mask_4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovb %esi, %k1
+; CHECK-NEXT: vptestmq (%rdi), %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: retq
+ %b = load <8 x i64>, <8 x i64>* %bptr
+ %and.i = and <8 x i64> %b, %a
+ %test.i = tail call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %and.i, <8 x i64> %and.i, i8 %mask)
+ %conv = zext i8 %test.i to i32
+ ret i32 %conv
+}
+
+declare i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64>, <8 x i64>, i8)
diff --git a/test/CodeGen/X86/commute-blend-avx2.ll b/test/CodeGen/X86/commute-blend-avx2.ll
index bd497ba40767..c39aa0b12b32 100644
--- a/test/CodeGen/X86/commute-blend-avx2.ll
+++ b/test/CodeGen/X86/commute-blend-avx2.ll
@@ -1,89 +1,90 @@
-; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=avx2 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s
define <8 x i16> @commute_fold_vpblendw_128(<8 x i16> %a, <8 x i16>* %b) #0 {
+; CHECK-LABEL: commute_fold_vpblendw_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+; CHECK-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %b
%2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17)
ret <8 x i16> %2
-
- ;LABEL: commute_fold_vpblendw_128
- ;CHECK: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
- ;CHECK-NEXT: retq
}
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
define <16 x i16> @commute_fold_vpblendw_256(<16 x i16> %a, <16 x i16>* %b) #0 {
+; CHECK-LABEL: commute_fold_vpblendw_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15]
+; CHECK-NEXT: retq
%1 = load <16 x i16>, <16 x i16>* %b
%2 = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %1, <16 x i16> %a, i8 17)
ret <16 x i16> %2
-
- ;LABEL: commute_fold_vpblendw_256
- ;CHECK: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15]
- ;CHECK-NEXT: retq
}
declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind readnone
define <4 x i32> @commute_fold_vpblendd_128(<4 x i32> %a, <4 x i32>* %b) #0 {
+; CHECK-LABEL: commute_fold_vpblendd_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
+; CHECK-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %b
%2 = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %1, <4 x i32> %a, i8 1)
ret <4 x i32> %2
-
- ;LABEL: commute_fold_vpblendd_128
- ;CHECK: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
- ;CHECK-NEXT: retq
}
declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind readnone
define <8 x i32> @commute_fold_vpblendd_256(<8 x i32> %a, <8 x i32>* %b) #0 {
+; CHECK-LABEL: commute_fold_vpblendd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6],ymm0[7]
+; CHECK-NEXT: retq
%1 = load <8 x i32>, <8 x i32>* %b
%2 = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %1, <8 x i32> %a, i8 129)
ret <8 x i32> %2
-
- ;LABEL: commute_fold_vpblendd_256
- ;CHECK: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6],ymm0[7]
- ;CHECK-NEXT: retq
}
declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
define <4 x float> @commute_fold_vblendps_128(<4 x float> %a, <4 x float>* %b) #0 {
+; CHECK-LABEL: commute_fold_vblendps_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3]
+; CHECK-NEXT: retq
%1 = load <4 x float>, <4 x float>* %b
%2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 5)
ret <4 x float> %2
-
- ;LABEL: commute_fold_vblendps_128
- ;CHECK: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3]
- ;CHECK-NEXT: retq
}
declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
define <8 x float> @commute_fold_vblendps_256(<8 x float> %a, <8 x float>* %b) #0 {
+; CHECK-LABEL: commute_fold_vblendps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],mem[3,4,5,6,7]
+; CHECK-NEXT: retq
%1 = load <8 x float>, <8 x float>* %b
%2 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %1, <8 x float> %a, i8 7)
ret <8 x float> %2
-
- ;LABEL: commute_fold_vblendps_256
- ;CHECK: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],mem[3,4,5,6,7]
- ;CHECK-NEXT: retq
}
declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
define <2 x double> @commute_fold_vblendpd_128(<2 x double> %a, <2 x double>* %b) #0 {
+; CHECK-LABEL: commute_fold_vblendpd_128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+; CHECK-NEXT: retq
%1 = load <2 x double>, <2 x double>* %b
%2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
ret <2 x double> %2
-
- ;LABEL: commute_fold_vblendpd_128
- ;CHECK: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
- ;CHECK-NEXT: retq
}
declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, <4 x double>* %b) #0 {
+; CHECK-LABEL: commute_fold_vblendpd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],mem[3]
+; CHECK-NEXT: retq
%1 = load <4 x double>, <4 x double>* %b
%2 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %1, <4 x double> %a, i8 7)
ret <4 x double> %2
-
- ;LABEL: commute_fold_vblendpd_256
- ;CHECK: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],mem[3]
- ;CHECK-NEXT: retq
}
declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/commute-blend-sse41.ll b/test/CodeGen/X86/commute-blend-sse41.ll
index 8cebcdb8eeae..14a685b179a5 100644
--- a/test/CodeGen/X86/commute-blend-sse41.ll
+++ b/test/CodeGen/X86/commute-blend-sse41.ll
@@ -1,34 +1,35 @@
-; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=corei7 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s
define <8 x i16> @commute_fold_pblendw(<8 x i16> %a, <8 x i16>* %b) #0 {
+; CHECK-LABEL: commute_fold_pblendw:
+; CHECK: # BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+; CHECK-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %b
%2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17)
ret <8 x i16> %2
-
- ;LABEL: commute_fold_pblendw
- ;CHECK: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
- ;CHECK-NEXT: retq
}
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
define <4 x float> @commute_fold_blendps(<4 x float> %a, <4 x float>* %b) #0 {
+; CHECK-LABEL: commute_fold_blendps:
+; CHECK: # BB#0:
+; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3]
+; CHECK-NEXT: retq
%1 = load <4 x float>, <4 x float>* %b
%2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 5)
ret <4 x float> %2
-
- ;LABEL: commute_fold_blendps
- ;CHECK: blendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3]
- ;CHECK-NEXT: retq
}
declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
define <2 x double> @commute_fold_blendpd(<2 x double> %a, <2 x double>* %b) #0 {
+; CHECK-LABEL: commute_fold_blendpd:
+; CHECK: # BB#0:
+; CHECK-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+; CHECK-NEXT: retq
%1 = load <2 x double>, <2 x double>* %b
%2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
ret <2 x double> %2
-
- ;LABEL: commute_fold_vblendpd
- ;CHECK: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
- ;CHECK-NEXT: retq
}
declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/commute-fcmp.ll b/test/CodeGen/X86/commute-fcmp.ll
index 6f43ebe1fcd7..4274d1feaa3b 100644
--- a/test/CodeGen/X86/commute-fcmp.ll
+++ b/test/CodeGen/X86/commute-fcmp.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX
@@ -6,164 +7,332 @@
; Only equal/not-equal/ordered/unordered can be safely commuted
;
-define <4 x i32> @commute_cmpps_eq(<4 x float>* %a0, <4 x float> %a1) #0 {
- ;SSE-LABEL: commute_cmpps_eq
- ;SSE: cmpeqps (%rdi), %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmpps_eq
- ;AVX: vcmpeqps (%rdi), %xmm0, %xmm0
- ;AVX-NEXT: retq
-
+define <4 x i32> @commute_cmpps_eq(<4 x float>* %a0, <4 x float> %a1) {
+; SSE-LABEL: commute_cmpps_eq:
+; SSE: # BB#0:
+; SSE-NEXT: cmpeqps (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_eq:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp oeq <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
ret <4 x i32> %3
}
-define <4 x i32> @commute_cmpps_ne(<4 x float>* %a0, <4 x float> %a1) #0 {
- ;SSE-LABEL: commute_cmpps_ne
- ;SSE: cmpneqps (%rdi), %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmpps_ne
- ;AVX: vcmpneqps (%rdi), %xmm0, %xmm0
- ;AVX-NEXT: retq
-
+define <4 x i32> @commute_cmpps_ne(<4 x float>* %a0, <4 x float> %a1) {
+; SSE-LABEL: commute_cmpps_ne:
+; SSE: # BB#0:
+; SSE-NEXT: cmpneqps (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_ne:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpneqps (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp une <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
ret <4 x i32> %3
}
-define <4 x i32> @commute_cmpps_ord(<4 x float>* %a0, <4 x float> %a1) #0 {
- ;SSE-LABEL: commute_cmpps_ord
- ;SSE: cmpordps (%rdi), %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmpps_ord
- ;AVX: vcmpordps (%rdi), %xmm0, %xmm0
- ;AVX-NEXT: retq
-
+define <4 x i32> @commute_cmpps_ord(<4 x float>* %a0, <4 x float> %a1) {
+; SSE-LABEL: commute_cmpps_ord:
+; SSE: # BB#0:
+; SSE-NEXT: cmpordps (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_ord:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpordps (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp ord <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
ret <4 x i32> %3
}
-define <4 x i32> @commute_cmpps_uno(<4 x float>* %a0, <4 x float> %a1) #0 {
- ;SSE-LABEL: commute_cmpps_uno
- ;SSE: cmpunordps (%rdi), %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmpps_uno
- ;AVX: vcmpunordps (%rdi), %xmm0, %xmm0
- ;AVX-NEXT: retq
-
+define <4 x i32> @commute_cmpps_uno(<4 x float>* %a0, <4 x float> %a1) {
+; SSE-LABEL: commute_cmpps_uno:
+; SSE: # BB#0:
+; SSE-NEXT: cmpunordps (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_uno:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpunordps (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp uno <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
ret <4 x i32> %3
}
-define <4 x i32> @commute_cmpps_lt(<4 x float>* %a0, <4 x float> %a1) #0 {
- ;SSE-LABEL: commute_cmpps_lt
- ;SSE: movaps (%rdi), %xmm1
- ;SSE-NEXT: cmpltps %xmm0, %xmm1
- ;SSE-NEXT: movaps %xmm1, %xmm0
- ;SSE-NEXT: retq
+define <4 x i32> @commute_cmpps_ueq(<4 x float>* %a0, <4 x float> %a1) {
+; SSE-LABEL: commute_cmpps_ueq:
+; SSE: # BB#0:
+; SSE-NEXT: movaps (%rdi), %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm2
+; SSE-NEXT: cmpeqps %xmm0, %xmm2
+; SSE-NEXT: cmpunordps %xmm1, %xmm0
+; SSE-NEXT: orps %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_ueq:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps (%rdi), %xmm1
+; AVX-NEXT: vcmpeqps %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vcmpunordps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vorps %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %1 = load <4 x float>, <4 x float>* %a0
+ %2 = fcmp ueq <4 x float> %1, %a1
+ %3 = sext <4 x i1> %2 to <4 x i32>
+ ret <4 x i32> %3
+}
- ;AVX-LABEL: commute_cmpps_lt
- ;AVX: vmovaps (%rdi), %xmm1
- ;AVX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
- ;AVX-NEXT: retq
+define <4 x i32> @commute_cmpps_one(<4 x float>* %a0, <4 x float> %a1) {
+; SSE-LABEL: commute_cmpps_one:
+; SSE: # BB#0:
+; SSE-NEXT: movaps (%rdi), %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm2
+; SSE-NEXT: cmpneqps %xmm0, %xmm2
+; SSE-NEXT: cmpordps %xmm1, %xmm0
+; SSE-NEXT: andps %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_one:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps (%rdi), %xmm1
+; AVX-NEXT: vcmpneqps %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vcmpordps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %1 = load <4 x float>, <4 x float>* %a0
+ %2 = fcmp one <4 x float> %1, %a1
+ %3 = sext <4 x i1> %2 to <4 x i32>
+ ret <4 x i32> %3
+}
+define <4 x i32> @commute_cmpps_lt(<4 x float>* %a0, <4 x float> %a1) {
+; SSE-LABEL: commute_cmpps_lt:
+; SSE: # BB#0:
+; SSE-NEXT: movaps (%rdi), %xmm1
+; SSE-NEXT: cmpltps %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_lt:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps (%rdi), %xmm1
+; AVX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp olt <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
ret <4 x i32> %3
}
-define <4 x i32> @commute_cmpps_le(<4 x float>* %a0, <4 x float> %a1) #0 {
- ;SSE-LABEL: commute_cmpps_le
- ;SSE: movaps (%rdi), %xmm1
- ;SSE-NEXT: cmpleps %xmm0, %xmm1
- ;SSE-NEXT: movaps %xmm1, %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmpps_le
- ;AVX: vmovaps (%rdi), %xmm1
- ;AVX-NEXT: vcmpleps %xmm0, %xmm1, %xmm0
- ;AVX-NEXT: retq
-
+define <4 x i32> @commute_cmpps_le(<4 x float>* %a0, <4 x float> %a1) {
+; SSE-LABEL: commute_cmpps_le:
+; SSE: # BB#0:
+; SSE-NEXT: movaps (%rdi), %xmm1
+; SSE-NEXT: cmpleps %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_le:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps (%rdi), %xmm1
+; AVX-NEXT: vcmpleps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp ole <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
ret <4 x i32> %3
}
-define <8 x i32> @commute_cmpps_eq_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
- ;AVX-LABEL: commute_cmpps_eq_ymm
- ;AVX: vcmpeqps (%rdi), %ymm0, %ymm0
- ;AVX-NEXT: retq
-
+define <8 x i32> @commute_cmpps_eq_ymm(<8 x float>* %a0, <8 x float> %a1) {
+; SSE-LABEL: commute_cmpps_eq_ymm:
+; SSE: # BB#0:
+; SSE-NEXT: cmpeqps (%rdi), %xmm0
+; SSE-NEXT: cmpeqps 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_eq_ymm:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp oeq <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
ret <8 x i32> %3
}
-define <8 x i32> @commute_cmpps_ne_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
- ;AVX-LABEL: commute_cmpps_ne_ymm
- ;AVX: vcmpneqps (%rdi), %ymm0, %ymm0
- ;AVX-NEXT: retq
-
+define <8 x i32> @commute_cmpps_ne_ymm(<8 x float>* %a0, <8 x float> %a1) {
+; SSE-LABEL: commute_cmpps_ne_ymm:
+; SSE: # BB#0:
+; SSE-NEXT: cmpneqps (%rdi), %xmm0
+; SSE-NEXT: cmpneqps 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_ne_ymm:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpneqps (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp une <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
ret <8 x i32> %3
}
-define <8 x i32> @commute_cmpps_ord_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
- ;AVX-LABEL: commute_cmpps_ord_ymm
- ;AVX: vcmpordps (%rdi), %ymm0, %ymm0
- ;AVX-NEXT: retq
-
+define <8 x i32> @commute_cmpps_ord_ymm(<8 x float>* %a0, <8 x float> %a1) {
+; SSE-LABEL: commute_cmpps_ord_ymm:
+; SSE: # BB#0:
+; SSE-NEXT: cmpordps (%rdi), %xmm0
+; SSE-NEXT: cmpordps 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_ord_ymm:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpordps (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp ord <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
ret <8 x i32> %3
}
-define <8 x i32> @commute_cmpps_uno_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
- ;AVX-LABEL: commute_cmpps_uno_ymm
- ;AVX: vcmpunordps (%rdi), %ymm0, %ymm0
- ;AVX-NEXT: retq
-
+define <8 x i32> @commute_cmpps_uno_ymm(<8 x float>* %a0, <8 x float> %a1) {
+; SSE-LABEL: commute_cmpps_uno_ymm:
+; SSE: # BB#0:
+; SSE-NEXT: cmpunordps (%rdi), %xmm0
+; SSE-NEXT: cmpunordps 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_uno_ymm:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpunordps (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp uno <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
ret <8 x i32> %3
}
-define <8 x i32> @commute_cmpps_lt_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
- ;AVX-LABEL: commute_cmpps_lt_ymm
- ;AVX: vmovaps (%rdi), %ymm1
- ;AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
- ;AVX-NEXT: retq
+define <8 x i32> @commute_cmpps_ueq_ymm(<8 x float>* %a0, <8 x float> %a1) {
+; SSE-LABEL: commute_cmpps_ueq_ymm:
+; SSE: # BB#0:
+; SSE-NEXT: movaps (%rdi), %xmm2
+; SSE-NEXT: movaps 16(%rdi), %xmm3
+; SSE-NEXT: movaps %xmm2, %xmm4
+; SSE-NEXT: cmpeqps %xmm0, %xmm4
+; SSE-NEXT: cmpunordps %xmm2, %xmm0
+; SSE-NEXT: orps %xmm4, %xmm0
+; SSE-NEXT: movaps %xmm3, %xmm2
+; SSE-NEXT: cmpeqps %xmm1, %xmm2
+; SSE-NEXT: cmpunordps %xmm3, %xmm1
+; SSE-NEXT: orps %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_ueq_ymm:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps (%rdi), %ymm1
+; AVX-NEXT: vcmpeqps %ymm0, %ymm1, %ymm2
+; AVX-NEXT: vcmpunordps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+ %1 = load <8 x float>, <8 x float>* %a0
+ %2 = fcmp ueq <8 x float> %1, %a1
+ %3 = sext <8 x i1> %2 to <8 x i32>
+ ret <8 x i32> %3
+}
+
+define <8 x i32> @commute_cmpps_one_ymm(<8 x float>* %a0, <8 x float> %a1) {
+; SSE-LABEL: commute_cmpps_one_ymm:
+; SSE: # BB#0:
+; SSE-NEXT: movaps (%rdi), %xmm2
+; SSE-NEXT: movaps 16(%rdi), %xmm3
+; SSE-NEXT: movaps %xmm2, %xmm4
+; SSE-NEXT: cmpneqps %xmm0, %xmm4
+; SSE-NEXT: cmpordps %xmm2, %xmm0
+; SSE-NEXT: andps %xmm4, %xmm0
+; SSE-NEXT: movaps %xmm3, %xmm2
+; SSE-NEXT: cmpneqps %xmm1, %xmm2
+; SSE-NEXT: cmpordps %xmm3, %xmm1
+; SSE-NEXT: andps %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_one_ymm:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps (%rdi), %ymm1
+; AVX-NEXT: vcmpneqps %ymm0, %ymm1, %ymm2
+; AVX-NEXT: vcmpordps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+ %1 = load <8 x float>, <8 x float>* %a0
+ %2 = fcmp one <8 x float> %1, %a1
+ %3 = sext <8 x i1> %2 to <8 x i32>
+ ret <8 x i32> %3
+}
+define <8 x i32> @commute_cmpps_lt_ymm(<8 x float>* %a0, <8 x float> %a1) {
+; SSE-LABEL: commute_cmpps_lt_ymm:
+; SSE: # BB#0:
+; SSE-NEXT: movaps (%rdi), %xmm2
+; SSE-NEXT: movaps 16(%rdi), %xmm3
+; SSE-NEXT: cmpltps %xmm0, %xmm2
+; SSE-NEXT: cmpltps %xmm1, %xmm3
+; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_lt_ymm:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps (%rdi), %ymm1
+; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp olt <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
ret <8 x i32> %3
}
-define <8 x i32> @commute_cmpps_le_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
- ;AVX-LABEL: commute_cmpps_le_ymm
- ;AVX: vmovaps (%rdi), %ymm1
- ;AVX-NEXT: vcmpleps %ymm0, %ymm1, %ymm0
- ;AVX-NEXT: retq
-
+define <8 x i32> @commute_cmpps_le_ymm(<8 x float>* %a0, <8 x float> %a1) {
+; SSE-LABEL: commute_cmpps_le_ymm:
+; SSE: # BB#0:
+; SSE-NEXT: movaps (%rdi), %xmm2
+; SSE-NEXT: movaps 16(%rdi), %xmm3
+; SSE-NEXT: cmpleps %xmm0, %xmm2
+; SSE-NEXT: cmpleps %xmm1, %xmm3
+; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: movaps %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmpps_le_ymm:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps (%rdi), %ymm1
+; AVX-NEXT: vcmpleps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp ole <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
@@ -175,164 +344,332 @@ define <8 x i32> @commute_cmpps_le_ymm(<8 x float>* %a0, <8 x float> %a1) #0 {
; Only equal/not-equal/ordered/unordered can be safely commuted
;
-define <2 x i64> @commute_cmppd_eq(<2 x double>* %a0, <2 x double> %a1) #0 {
- ;SSE-LABEL: commute_cmppd_eq
- ;SSE: cmpeqpd (%rdi), %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmppd_eq
- ;AVX: vcmpeqpd (%rdi), %xmm0, %xmm0
- ;AVX-NEXT: retq
-
+define <2 x i64> @commute_cmppd_eq(<2 x double>* %a0, <2 x double> %a1) {
+; SSE-LABEL: commute_cmppd_eq:
+; SSE: # BB#0:
+; SSE-NEXT: cmpeqpd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_eq:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp oeq <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
ret <2 x i64> %3
}
-define <2 x i64> @commute_cmppd_ne(<2 x double>* %a0, <2 x double> %a1) #0 {
- ;SSE-LABEL: commute_cmppd_ne
- ;SSE: cmpneqpd (%rdi), %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmppd_ne
- ;AVX: vcmpneqpd (%rdi), %xmm0, %xmm0
- ;AVX-NEXT: retq
-
+define <2 x i64> @commute_cmppd_ne(<2 x double>* %a0, <2 x double> %a1) {
+; SSE-LABEL: commute_cmppd_ne:
+; SSE: # BB#0:
+; SSE-NEXT: cmpneqpd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_ne:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpneqpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp une <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
ret <2 x i64> %3
}
-define <2 x i64> @commute_cmppd_ord(<2 x double>* %a0, <2 x double> %a1) #0 {
- ;SSE-LABEL: commute_cmppd_ord
- ;SSE: cmpordpd (%rdi), %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmppd_ord
- ;AVX: vcmpordpd (%rdi), %xmm0, %xmm0
- ;AVX-NEXT: retq
-
+define <2 x i64> @commute_cmppd_ord(<2 x double>* %a0, <2 x double> %a1) {
+; SSE-LABEL: commute_cmppd_ord:
+; SSE: # BB#0:
+; SSE-NEXT: cmpordpd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_ord:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpordpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp ord <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
ret <2 x i64> %3
}
-define <2 x i64> @commute_cmppd_uno(<2 x double>* %a0, <2 x double> %a1) #0 {
- ;SSE-LABEL: commute_cmppd_uno
- ;SSE: cmpunordpd (%rdi), %xmm0
- ;SSE-NEXT: retq
+define <2 x i64> @commute_cmppd_ueq(<2 x double>* %a0, <2 x double> %a1) {
+; SSE-LABEL: commute_cmppd_ueq:
+; SSE: # BB#0:
+; SSE-NEXT: movapd (%rdi), %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm2
+; SSE-NEXT: cmpeqpd %xmm0, %xmm2
+; SSE-NEXT: cmpunordpd %xmm1, %xmm0
+; SSE-NEXT: orpd %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_ueq:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd (%rdi), %xmm1
+; AVX-NEXT: vcmpeqpd %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vcmpunordpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vorpd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %1 = load <2 x double>, <2 x double>* %a0
+ %2 = fcmp ueq <2 x double> %1, %a1
+ %3 = sext <2 x i1> %2 to <2 x i64>
+ ret <2 x i64> %3
+}
- ;AVX-LABEL: commute_cmppd_uno
- ;AVX: vcmpunordpd (%rdi), %xmm0, %xmm0
- ;AVX-NEXT: retq
+define <2 x i64> @commute_cmppd_one(<2 x double>* %a0, <2 x double> %a1) {
+; SSE-LABEL: commute_cmppd_one:
+; SSE: # BB#0:
+; SSE-NEXT: movapd (%rdi), %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm2
+; SSE-NEXT: cmpneqpd %xmm0, %xmm2
+; SSE-NEXT: cmpordpd %xmm1, %xmm0
+; SSE-NEXT: andpd %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_one:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd (%rdi), %xmm1
+; AVX-NEXT: vcmpneqpd %xmm0, %xmm1, %xmm2
+; AVX-NEXT: vcmpordpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %1 = load <2 x double>, <2 x double>* %a0
+ %2 = fcmp one <2 x double> %1, %a1
+ %3 = sext <2 x i1> %2 to <2 x i64>
+ ret <2 x i64> %3
+}
+define <2 x i64> @commute_cmppd_uno(<2 x double>* %a0, <2 x double> %a1) {
+; SSE-LABEL: commute_cmppd_uno:
+; SSE: # BB#0:
+; SSE-NEXT: cmpunordpd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_uno:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpunordpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp uno <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
ret <2 x i64> %3
}
-define <2 x i64> @commute_cmppd_lt(<2 x double>* %a0, <2 x double> %a1) #0 {
- ;SSE-LABEL: commute_cmppd_lt
- ;SSE: movapd (%rdi), %xmm1
- ;SSE-NEXT: cmpltpd %xmm0, %xmm1
- ;SSE-NEXT: movapd %xmm1, %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmppd_lt
- ;AVX: vmovapd (%rdi), %xmm1
- ;AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
- ;AVX-NEXT: retq
-
+define <2 x i64> @commute_cmppd_lt(<2 x double>* %a0, <2 x double> %a1) {
+; SSE-LABEL: commute_cmppd_lt:
+; SSE: # BB#0:
+; SSE-NEXT: movapd (%rdi), %xmm1
+; SSE-NEXT: cmpltpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_lt:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd (%rdi), %xmm1
+; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp olt <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
ret <2 x i64> %3
}
-define <2 x i64> @commute_cmppd_le(<2 x double>* %a0, <2 x double> %a1) #0 {
- ;SSE-LABEL: commute_cmppd_le
- ;SSE: movapd (%rdi), %xmm1
- ;SSE-NEXT: cmplepd %xmm0, %xmm1
- ;SSE-NEXT: movapd %xmm1, %xmm0
- ;SSE-NEXT: retq
-
- ;AVX-LABEL: commute_cmppd_le
- ;AVX: vmovapd (%rdi), %xmm1
- ;AVX-NEXT: vcmplepd %xmm0, %xmm1, %xmm0
- ;AVX-NEXT: retq
-
+define <2 x i64> @commute_cmppd_le(<2 x double>* %a0, <2 x double> %a1) {
+; SSE-LABEL: commute_cmppd_le:
+; SSE: # BB#0:
+; SSE-NEXT: movapd (%rdi), %xmm1
+; SSE-NEXT: cmplepd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_le:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd (%rdi), %xmm1
+; AVX-NEXT: vcmplepd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+;
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp ole <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
ret <2 x i64> %3
}
-define <4 x i64> @commute_cmppd_eq_ymmm(<4 x double>* %a0, <4 x double> %a1) #0 {
- ;AVX-LABEL: commute_cmppd_eq
- ;AVX: vcmpeqpd (%rdi), %ymm0, %ymm0
- ;AVX-NEXT: retq
-
+define <4 x i64> @commute_cmppd_eq_ymmm(<4 x double>* %a0, <4 x double> %a1) {
+; SSE-LABEL: commute_cmppd_eq_ymmm:
+; SSE: # BB#0:
+; SSE-NEXT: cmpeqpd (%rdi), %xmm0
+; SSE-NEXT: cmpeqpd 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_eq_ymmm:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp oeq <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
ret <4 x i64> %3
}
-define <4 x i64> @commute_cmppd_ne_ymmm(<4 x double>* %a0, <4 x double> %a1) #0 {
- ;AVX-LABEL: commute_cmppd_ne
- ;AVX: vcmpneqpd (%rdi), %ymm0, %ymm0
- ;AVX-NEXT: retq
-
+define <4 x i64> @commute_cmppd_ne_ymmm(<4 x double>* %a0, <4 x double> %a1) {
+; SSE-LABEL: commute_cmppd_ne_ymmm:
+; SSE: # BB#0:
+; SSE-NEXT: cmpneqpd (%rdi), %xmm0
+; SSE-NEXT: cmpneqpd 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_ne_ymmm:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpneqpd (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp une <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
ret <4 x i64> %3
}
-define <4 x i64> @commute_cmppd_ord_ymmm(<4 x double>* %a0, <4 x double> %a1) #0 {
- ;AVX-LABEL: commute_cmppd_ord
- ;AVX: vcmpordpd (%rdi), %ymm0, %ymm0
- ;AVX-NEXT: retq
-
+define <4 x i64> @commute_cmppd_ord_ymmm(<4 x double>* %a0, <4 x double> %a1) {
+; SSE-LABEL: commute_cmppd_ord_ymmm:
+; SSE: # BB#0:
+; SSE-NEXT: cmpordpd (%rdi), %xmm0
+; SSE-NEXT: cmpordpd 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_ord_ymmm:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpordpd (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp ord <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
ret <4 x i64> %3
}
-define <4 x i64> @commute_cmppd_uno_ymmm(<4 x double>* %a0, <4 x double> %a1) #0 {
- ;AVX-LABEL: commute_cmppd_uno
- ;AVX: vcmpunordpd (%rdi), %ymm0, %ymm0
- ;AVX-NEXT: retq
-
+define <4 x i64> @commute_cmppd_uno_ymmm(<4 x double>* %a0, <4 x double> %a1) {
+; SSE-LABEL: commute_cmppd_uno_ymmm:
+; SSE: # BB#0:
+; SSE-NEXT: cmpunordpd (%rdi), %xmm0
+; SSE-NEXT: cmpunordpd 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_uno_ymmm:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpunordpd (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp uno <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
ret <4 x i64> %3
}
-define <4 x i64> @commute_cmppd_lt_ymmm(<4 x double>* %a0, <4 x double> %a1) #0 {
- ;AVX-LABEL: commute_cmppd_lt
- ;AVX: vmovapd (%rdi), %ymm1
- ;AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
- ;AVX-NEXT: retq
+define <4 x i64> @commute_cmppd_ueq_ymmm(<4 x double>* %a0, <4 x double> %a1) {
+; SSE-LABEL: commute_cmppd_ueq_ymmm:
+; SSE: # BB#0:
+; SSE-NEXT: movapd (%rdi), %xmm2
+; SSE-NEXT: movapd 16(%rdi), %xmm3
+; SSE-NEXT: movapd %xmm2, %xmm4
+; SSE-NEXT: cmpeqpd %xmm0, %xmm4
+; SSE-NEXT: cmpunordpd %xmm2, %xmm0
+; SSE-NEXT: orpd %xmm4, %xmm0
+; SSE-NEXT: movapd %xmm3, %xmm2
+; SSE-NEXT: cmpeqpd %xmm1, %xmm2
+; SSE-NEXT: cmpunordpd %xmm3, %xmm1
+; SSE-NEXT: orpd %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_ueq_ymmm:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd (%rdi), %ymm1
+; AVX-NEXT: vcmpeqpd %ymm0, %ymm1, %ymm2
+; AVX-NEXT: vcmpunordpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vorpd %ymm2, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+ %1 = load <4 x double>, <4 x double>* %a0
+ %2 = fcmp ueq <4 x double> %1, %a1
+ %3 = sext <4 x i1> %2 to <4 x i64>
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @commute_cmppd_one_ymmm(<4 x double>* %a0, <4 x double> %a1) {
+; SSE-LABEL: commute_cmppd_one_ymmm:
+; SSE: # BB#0:
+; SSE-NEXT: movapd (%rdi), %xmm2
+; SSE-NEXT: movapd 16(%rdi), %xmm3
+; SSE-NEXT: movapd %xmm2, %xmm4
+; SSE-NEXT: cmpneqpd %xmm0, %xmm4
+; SSE-NEXT: cmpordpd %xmm2, %xmm0
+; SSE-NEXT: andpd %xmm4, %xmm0
+; SSE-NEXT: movapd %xmm3, %xmm2
+; SSE-NEXT: cmpneqpd %xmm1, %xmm2
+; SSE-NEXT: cmpordpd %xmm3, %xmm1
+; SSE-NEXT: andpd %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_one_ymmm:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd (%rdi), %ymm1
+; AVX-NEXT: vcmpneqpd %ymm0, %ymm1, %ymm2
+; AVX-NEXT: vcmpordpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vandpd %ymm2, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+ %1 = load <4 x double>, <4 x double>* %a0
+ %2 = fcmp one <4 x double> %1, %a1
+ %3 = sext <4 x i1> %2 to <4 x i64>
+ ret <4 x i64> %3
+}
+define <4 x i64> @commute_cmppd_lt_ymmm(<4 x double>* %a0, <4 x double> %a1) {
+; SSE-LABEL: commute_cmppd_lt_ymmm:
+; SSE: # BB#0:
+; SSE-NEXT: movapd (%rdi), %xmm2
+; SSE-NEXT: movapd 16(%rdi), %xmm3
+; SSE-NEXT: cmpltpd %xmm0, %xmm2
+; SSE-NEXT: cmpltpd %xmm1, %xmm3
+; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: movapd %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_lt_ymmm:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd (%rdi), %ymm1
+; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp olt <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
ret <4 x i64> %3
}
-define <4 x i64> @commute_cmppd_le_ymmm(<4 x double>* %a0, <4 x double> %a1) #0 {
- ;AVX-LABEL: commute_cmppd_le
- ;AVX: vmovapd (%rdi), %ymm1
- ;AVX-NEXT: vcmplepd %ymm0, %ymm1, %ymm0
- ;AVX-NEXT: retq
-
+define <4 x i64> @commute_cmppd_le_ymmm(<4 x double>* %a0, <4 x double> %a1) {
+; SSE-LABEL: commute_cmppd_le_ymmm:
+; SSE: # BB#0:
+; SSE-NEXT: movapd (%rdi), %xmm2
+; SSE-NEXT: movapd 16(%rdi), %xmm3
+; SSE-NEXT: cmplepd %xmm0, %xmm2
+; SSE-NEXT: cmplepd %xmm1, %xmm3
+; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: movapd %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: commute_cmppd_le_ymmm:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd (%rdi), %ymm1
+; AVX-NEXT: vcmplepd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: retq
+;
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp ole <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
diff --git a/test/CodeGen/X86/constructor.ll b/test/CodeGen/X86/constructor.ll
index e7c846045f01..fd94f595005a 100644
--- a/test/CodeGen/X86/constructor.ll
+++ b/test/CodeGen/X86/constructor.ll
@@ -16,22 +16,22 @@ entry:
}
; CTOR: .section .ctors.65520,"aGw",@progbits,v,comdat
-; CTOR-NEXT: .align 8
+; CTOR-NEXT: .p2align 3
; CTOR-NEXT: .quad g
; CTOR-NEXT: .section .ctors,"aw",@progbits
-; CTOR-NEXT: .align 8
+; CTOR-NEXT: .p2align 3
; CTOR-NEXT: .quad f
; INIT-ARRAY: .section .init_array.15,"aGw",@init_array,v,comdat
-; INIT-ARRAY-NEXT: .align 8
+; INIT-ARRAY-NEXT: .p2align 3
; INIT-ARRAY-NEXT: .quad g
; INIT-ARRAY-NEXT: .section .init_array,"aw",@init_array
-; INIT-ARRAY-NEXT: .align 8
+; INIT-ARRAY-NEXT: .p2align 3
; INIT-ARRAY-NEXT: .quad f
; NACL: .section .init_array.15,"aGw",@init_array,v,comdat
-; NACL-NEXT: .align 4
+; NACL-NEXT: .p2align 2
; NACL-NEXT: .long g
; NACL-NEXT: .section .init_array,"aw",@init_array
-; NACL-NEXT: .align 4
+; NACL-NEXT: .p2align 2
; NACL-NEXT: .long f
diff --git a/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll b/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll
new file mode 100644
index 000000000000..b4c30a7380c8
--- /dev/null
+++ b/test/CodeGen/X86/crash-lre-eliminate-dead-def.ll
@@ -0,0 +1,268 @@
+; RUN: llc %s -o - | FileCheck %s
+; This file checks a weird corner case in LiveRangeEdit.
+; We used to crash when eliminating the definition
+; of the product of splitting when the original live-range
+; has already been removed.
+; Basically, we have the following input.
+; v1 = loadimm cst
+; ...
+; = use v1
+;
+; We split the live-range like this:
+; v1 = loadimm cst
+; ...
+; v2 = copy v1
+; ...
+; = use v2
+;
+; We actually issue loadimm instead of the copy:
+; v1 = loadimm cst
+; ...
+; v2 = loadimm cst
+; ...
+; = use v2
+;
+; v1 is now dead so we remove its live-range.
+; Actually, we shrink it to empty to keep the
+; instruction around for further remat opportunities
+; (accessible via the origin pointer).
+;
+; Later v2 gets removed as well (e.g., because we
+; remat it closer to its use) and the live-range
+; gets eliminated. We used to crash at this point
+; because we were looking for a VNI of origin (v1)
+; at the slot index of the definition of v2. However,
+; we do not have a VNI for v1 at this point, since the
+; live-range is now empty... crash!
+; PR27983
+
+source_filename = "bugpoint-output-1e29d28.bc"
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+@r = external global i32, align 4
+@k = external global i32, align 4
+@g = external global i32, align 4
+@a = external global i16, align 2
+@p = external global i32, align 4
+@n = external global i16, align 2
+@.str = external unnamed_addr constant [12 x i8], align 1
+@.str.1 = external unnamed_addr constant [13 x i8], align 1
+@s = external global i32, align 4
+@z = external global i16, align 2
+
+; CHECK-LABEL: fn1:
+define void @fn1() #0 {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %cleanup100, %for.end29, %entry
+ %t7.0 = phi i16 [ undef, %entry ], [ %t7.1, %for.end29 ], [ %t7.19, %cleanup100 ]
+ %t9.0 = phi i32 [ undef, %entry ], [ %t9.1, %for.end29 ], [ 0, %cleanup100 ]
+ %t2.0 = phi i32 [ undef, %entry ], [ undef, %for.end29 ], [ %t2.18, %cleanup100 ]
+ %tmp = load i32, i32* @r, align 4
+ br i1 undef, label %if.then, label %if.end7
+
+if.then: ; preds = %for.cond
+ %tobool = icmp ne i32 %tmp, 0
+ %tobool1 = icmp ne i32 %t2.0, 0
+ %tmp1 = and i1 %tobool1, %tobool
+ %land.ext = zext i1 %tmp1 to i32
+ %tmp2 = load i32, i32* @k, align 4
+ %shr = lshr i32 %land.ext, %tmp2
+ %tobool4 = icmp ne i32 %shr, 0
+ %or.cond = and i1 false, %tobool4
+ br i1 %or.cond, label %L6, label %if.end7
+
+if.end7: ; preds = %if.then, %for.cond
+ %t2.1 = phi i32 [ %shr, %if.then ], [ %t2.0, %for.cond ]
+ %tobool8 = icmp eq i32 undef, 0
+ br i1 %tobool8, label %if.end11, label %for.cond10
+
+for.cond10: ; preds = %for.cond10, %if.end7
+ br label %for.cond10
+
+if.end11: ; preds = %if.end7
+ %tmp3 = load i32, i32* @g, align 4
+ %tmp4 = load i16, i16* @a, align 2
+ %conv = sext i16 %tmp4 to i32
+ %div = sdiv i32 %tmp3, %conv
+ %tobool12 = icmp eq i32 %div, 0
+ br i1 %tobool12, label %for.cond15, label %L5
+
+for.cond15: ; preds = %for.cond17, %if.end11
+ %t7.1 = phi i16 [ %t7.2, %for.cond17 ], [ %t7.0, %if.end11 ]
+ %t9.1 = phi i32 [ %t9.2, %for.cond17 ], [ %t9.0, %if.end11 ]
+ %tobool16 = icmp eq i32 undef, 0
+ br i1 %tobool16, label %for.end29, label %for.cond17
+
+for.cond17: ; preds = %for.cond20, %for.cond15
+ %t7.2 = phi i16 [ %t7.3, %for.cond20 ], [ %t7.1, %for.cond15 ]
+ %t9.2 = phi i32 [ undef, %for.cond20 ], [ %t9.1, %for.cond15 ]
+ %tobool18 = icmp eq i8 undef, 0
+ br i1 %tobool18, label %for.cond15, label %for.cond20
+
+for.cond20: ; preds = %for.cond23, %for.cond17
+ %t7.3 = phi i16 [ %t7.4, %for.cond23 ], [ %t7.2, %for.cond17 ]
+ %tobool21 = icmp eq i32 undef, 0
+ br i1 %tobool21, label %for.cond17, label %for.cond23
+
+for.cond23: ; preds = %L1, %for.cond20
+ %t7.4 = phi i16 [ %t7.5, %L1 ], [ %t7.3, %for.cond20 ]
+ %tobool24 = icmp eq i8 undef, 0
+ br i1 %tobool24, label %for.cond20, label %L1
+
+L1: ; preds = %cleanup100, %for.cond23
+ %t7.5 = phi i16 [ %t7.19, %cleanup100 ], [ %t7.4, %for.cond23 ]
+ %conv26 = sext i16 undef to i64
+ br label %for.cond23
+
+for.end29: ; preds = %for.cond15
+ br i1 undef, label %for.cond, label %for.cond32thread-pre-split
+
+for.cond32thread-pre-split: ; preds = %for.end29
+ %.pr = load i32, i32* @p, align 4
+ br label %for.cond32
+
+for.cond32: ; preds = %for.inc94, %for.cond32thread-pre-split
+ %t7.6 = phi i16 [ %t7.1, %for.cond32thread-pre-split ], [ %t7.17, %for.inc94 ]
+ %t3.4 = phi i64 [ 0, %for.cond32thread-pre-split ], [ 0, %for.inc94 ]
+ %t9.6 = phi i32 [ %t9.1, %for.cond32thread-pre-split ], [ 0, %for.inc94 ]
+ %t2.7 = phi i32 [ undef, %for.cond32thread-pre-split ], [ %t2.16, %for.inc94 ]
+ %tobool33 = icmp eq i32 0, 0
+ br i1 %tobool33, label %for.end95, label %for.body34
+
+for.body34: ; preds = %for.cond32
+ %tobool35 = icmp eq i16 undef, 0
+ br i1 %tobool35, label %for.inc94, label %if.then36
+
+if.then36: ; preds = %for.body34
+ %tmp5 = load i16, i16* @n, align 2
+ %tobool37 = icmp eq i32 undef, 0
+ br i1 %tobool37, label %if.end78, label %if.then38
+
+if.then38: ; preds = %if.then36
+ tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str, i32 0, i32 0), i64 undef)
+ %tobool40 = icmp eq i32 undef, 0
+ br i1 %tobool40, label %L3, label %cleanup100
+
+L3: ; preds = %while.end.split, %if.then38
+ %t7.7 = phi i16 [ %tmp5, %if.then38 ], [ %t7.15, %while.end.split ]
+ %t3.5 = phi i64 [ %t3.4, %if.then38 ], [ %t3.11, %while.end.split ]
+ %t2.8 = phi i32 [ %t2.7, %if.then38 ], [ %t2.14, %while.end.split ]
+ %tobool43 = icmp eq i32 undef, 0
+ br i1 %tobool43, label %if.end48, label %cleanup75
+
+if.end48: ; preds = %L3
+ tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), i64 %t3.5)
+ br i1 undef, label %if.end61, label %for.cond52.preheader
+
+for.cond52.preheader: ; preds = %if.end48
+ %tobool57 = icmp eq i16 undef, 0
+ %.130 = select i1 %tobool57, i16 -8, i16 0
+ br label %if.end61
+
+if.end61: ; preds = %for.cond52.preheader, %if.end48
+ %t7.9 = phi i16 [ %t7.7, %if.end48 ], [ %.130, %for.cond52.preheader ]
+ %tobool62 = icmp eq i32 undef, 0
+ br i1 %tobool62, label %if.end71, label %if.then63
+
+if.then63: ; preds = %if.end61
+ br i1 undef, label %if.end67, label %L5
+
+L5: ; preds = %cleanup100.L5_crit_edge, %if.then63, %if.end11
+ %.pre = phi i32 [ %.pre.pre, %cleanup100.L5_crit_edge ], [ undef, %if.then63 ], [ %tmp, %if.end11 ]
+ %t7.10 = phi i16 [ %t7.19, %cleanup100.L5_crit_edge ], [ %t7.9, %if.then63 ], [ %t7.0, %if.end11 ]
+ %t3.6 = phi i64 [ 0, %cleanup100.L5_crit_edge ], [ %t3.5, %if.then63 ], [ 2, %if.end11 ]
+ %t9.8 = phi i32 [ 0, %cleanup100.L5_crit_edge ], [ undef, %if.then63 ], [ %t9.0, %if.end11 ]
+ %t2.9 = phi i32 [ %t2.18, %cleanup100.L5_crit_edge ], [ %t2.8, %if.then63 ], [ %t2.1, %if.end11 ]
+ store i32 %t9.8, i32* @s, align 4
+ br label %if.end67
+
+if.end67: ; preds = %L5, %if.then63
+ %tmp6 = phi i32 [ %.pre, %L5 ], [ undef, %if.then63 ]
+ %t7.11 = phi i16 [ %t7.10, %L5 ], [ %t7.9, %if.then63 ]
+ %t3.7 = phi i64 [ %t3.6, %L5 ], [ %t3.5, %if.then63 ]
+ %t9.9 = phi i32 [ %t9.8, %L5 ], [ undef, %if.then63 ]
+ %t2.10 = phi i32 [ %t2.9, %L5 ], [ %t2.8, %if.then63 ]
+ %tobool68 = icmp eq i32 %tmp6, 0
+ br i1 %tobool68, label %if.end71, label %for.end95
+
+if.end71: ; preds = %if.end67, %if.end61
+ %t7.12 = phi i16 [ %t7.11, %if.end67 ], [ %t7.9, %if.end61 ]
+ %t3.8 = phi i64 [ %t3.7, %if.end67 ], [ %t3.5, %if.end61 ]
+ %tobool72 = icmp eq i32 undef, 0
+ br i1 %tobool72, label %cleanup75.thread128, label %if.then73
+
+if.then73: ; preds = %if.end71
+ br label %cleanup100
+
+cleanup75.thread128: ; preds = %if.end71
+ br label %if.end78
+
+cleanup75: ; preds = %L3
+ br i1 false, label %for.cond98, label %for.end95
+
+if.end78: ; preds = %cleanup75.thread128, %if.then36
+ %t7.14 = phi i16 [ %tmp5, %if.then36 ], [ 0, %cleanup75.thread128 ]
+ %t3.10 = phi i64 [ %t3.4, %if.then36 ], [ %t3.8, %cleanup75.thread128 ]
+ %t9.12 = phi i32 [ %t9.6, %if.then36 ], [ undef, %cleanup75.thread128 ]
+ %t2.13 = phi i32 [ %t2.7, %if.then36 ], [ undef, %cleanup75.thread128 ]
+ store i16 %t7.14, i16* @z, align 2
+ br label %L6
+
+L6: ; preds = %if.end78, %if.then
+ %t7.15 = phi i16 [ %t7.0, %if.then ], [ %t7.14, %if.end78 ]
+ %t3.11 = phi i64 [ 2, %if.then ], [ %t3.10, %if.end78 ]
+ %t9.13 = phi i32 [ %t9.0, %if.then ], [ %t9.12, %if.end78 ]
+ %t2.14 = phi i32 [ %shr, %if.then ], [ %t2.13, %if.end78 ]
+ br i1 undef, label %while.condthread-pre-split, label %for.inc94
+
+while.condthread-pre-split: ; preds = %L6
+ %tobool83 = icmp eq i32 undef, 0
+ br i1 %tobool83, label %while.end.split, label %while.cond
+
+while.cond: ; preds = %while.cond, %while.condthread-pre-split
+ br label %while.cond
+
+while.end.split: ; preds = %while.condthread-pre-split
+ %tobool84 = icmp eq i16 undef, 0
+ br i1 %tobool84, label %for.inc94, label %L3
+
+for.inc94: ; preds = %while.end.split, %L6, %for.body34
+ %t7.17 = phi i16 [ %t7.6, %for.body34 ], [ %t7.15, %L6 ], [ %t7.15, %while.end.split ]
+ %t2.16 = phi i32 [ %t2.7, %for.body34 ], [ %t2.14, %L6 ], [ %t2.14, %while.end.split ]
+ store i32 undef, i32* @p, align 4
+ br label %for.cond32
+
+for.end95: ; preds = %cleanup75, %if.end67, %for.cond32
+ %t7.18 = phi i16 [ %t7.6, %for.cond32 ], [ %t7.7, %cleanup75 ], [ %t7.11, %if.end67 ]
+ %t2.17 = phi i32 [ %t2.7, %for.cond32 ], [ %t2.8, %cleanup75 ], [ %t2.10, %if.end67 ]
+ %tobool96 = icmp eq i32 undef, 0
+ br i1 %tobool96, label %cleanup100, label %for.cond98
+
+for.cond98: ; preds = %for.cond98, %for.end95, %cleanup75
+ br label %for.cond98
+
+cleanup100: ; preds = %for.end95, %if.then73, %if.then38
+ %t7.19 = phi i16 [ %t7.18, %for.end95 ], [ %tmp5, %if.then38 ], [ %t7.12, %if.then73 ]
+ %t2.18 = phi i32 [ %t2.17, %for.end95 ], [ %t2.7, %if.then38 ], [ undef, %if.then73 ]
+ switch i32 undef, label %unreachable [
+ i32 0, label %for.cond
+ i32 17, label %L1
+ i32 7, label %cleanup100.L5_crit_edge
+ ]
+
+cleanup100.L5_crit_edge: ; preds = %cleanup100
+ %.pre.pre = load i32, i32* @r, align 4
+ br label %L5
+
+unreachable: ; preds = %cleanup100
+ unreachable
+}
+
+; Function Attrs: nounwind
+declare void @printf(i8* nocapture readonly, ...) #1
+
+attributes #0 = { noreturn nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/X86/ctpop-combine.ll b/test/CodeGen/X86/ctpop-combine.ll
index 463505bd95d9..435401639f05 100644
--- a/test/CodeGen/X86/ctpop-combine.ll
+++ b/test/CodeGen/X86/ctpop-combine.ll
@@ -1,40 +1,50 @@
-; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=corei7 | FileCheck %s
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
define i32 @test1(i64 %x) nounwind readnone {
+; CHECK-LABEL: test1:
+; CHECK: # BB#0:
+; CHECK-NEXT: leaq -1(%rdi), %rcx
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testq %rcx, %rdi
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: retq
%count = tail call i64 @llvm.ctpop.i64(i64 %x)
%cast = trunc i64 %count to i32
%cmp = icmp ugt i32 %cast, 1
%conv = zext i1 %cmp to i32
ret i32 %conv
-; CHECK-LABEL: test1:
-; CHECK: leaq -1([[A0:%rdi|%rcx]])
-; CHECK-NEXT: testq
-; CHECK-NEXT: setne
-; CHECK: ret
}
define i32 @test2(i64 %x) nounwind readnone {
+; CHECK-LABEL: test2:
+; CHECK: # BB#0:
+; CHECK-NEXT: leaq -1(%rdi), %rcx
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testq %rcx, %rdi
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
%count = tail call i64 @llvm.ctpop.i64(i64 %x)
%cmp = icmp ult i64 %count, 2
%conv = zext i1 %cmp to i32
ret i32 %conv
-; CHECK-LABEL: test2:
-; CHECK: leaq -1([[A0]])
-; CHECK-NEXT: testq
-; CHECK-NEXT: sete
-; CHECK: ret
}
define i32 @test3(i64 %x) nounwind readnone {
+; CHECK-LABEL: test3:
+; CHECK: # BB#0:
+; CHECK-NEXT: popcntq %rdi, %rax
+; CHECK-NEXT: andb $63, %al
+; CHECK-NEXT: cmpb $2, %al
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: retq
%count = tail call i64 @llvm.ctpop.i64(i64 %x)
%cast = trunc i64 %count to i6 ; Too small for 0-64
%cmp = icmp ult i6 %cast, 2
%conv = zext i1 %cmp to i32
ret i32 %conv
-; CHECK-LABEL: test3:
-; CHECK: cmpl $2
-; CHECK: ret
}
diff --git a/test/CodeGen/X86/cxx_tlscc64.ll b/test/CodeGen/X86/cxx_tlscc64.ll
index 6c8e45e42d15..ef947367c09e 100644
--- a/test/CodeGen/X86/cxx_tlscc64.ll
+++ b/test/CodeGen/X86/cxx_tlscc64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-apple-darwin | FileCheck %s
; TLS functions were wrongly modeled and, after fixing that, shrink-wrapping
; cannot help here. To achieve the expected lowering, we need to play
; tricks similar to the AArch64 fast TLS calling convention (r255821).
@@ -39,6 +39,27 @@ declare i32 @_tlv_atexit(void (i8*)*, i8*, i8*)
; CHECK-NOT: popq %r9
; CHECK-NOT: popq %r10
; CHECK-NOT: popq %r11
+
+; CHECK-O0-LABEL: _ZTW2sg
+; CHECK-O0: pushq %r11
+; CHECK-O0: pushq %r10
+; CHECK-O0: pushq %r9
+; CHECK-O0: pushq %r8
+; CHECK-O0: pushq %rsi
+; CHECK-O0: pushq %rdx
+; CHECK-O0: pushq %rcx
+; CHECK-O0: callq
+; CHECK-O0: jne
+; CHECK-O0: callq
+; CHECK-O0: tlv_atexit
+; CHECK-O0: callq
+; CHECK-O0: popq %rcx
+; CHECK-O0: popq %rdx
+; CHECK-O0: popq %rsi
+; CHECK-O0: popq %r8
+; CHECK-O0: popq %r9
+; CHECK-O0: popq %r10
+; CHECK-O0: popq %r11
define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() nounwind {
%.b.i = load i1, i1* @__tls_guard, align 1
br i1 %.b.i, label %__tls_init.exit, label %init.i
@@ -63,6 +84,24 @@ __tls_init.exit:
; CHECK-NOT: pushq %rcx
; CHECK-NOT: pushq %rbx
; CHECK: callq
+; CHECK-O0-LABEL: _ZTW4sum1
+; CHECK-O0-NOT: pushq %r11
+; CHECK-O0-NOT: pushq %r10
+; CHECK-O0-NOT: pushq %r9
+; CHECK-O0-NOT: pushq %r8
+; CHECK-O0-NOT: pushq %rsi
+; CHECK-O0-NOT: pushq %rdx
+; CHECK-O0-NOT: pushq %rcx
+; CHECK-O0-NOT: pushq %rbx
+; CHECK-O0-NOT: movq %r11
+; CHECK-O0-NOT: movq %r10
+; CHECK-O0-NOT: movq %r9
+; CHECK-O0-NOT: movq %r8
+; CHECK-O0-NOT: movq %rsi
+; CHECK-O0-NOT: movq %rdx
+; CHECK-O0-NOT: movq %rcx
+; CHECK-O0-NOT: movq %rbx
+; CHECK-O0: callq
define cxx_fast_tlscc nonnull i32* @_ZTW4sum1() nounwind {
ret i32* @sum1
}
@@ -76,4 +115,57 @@ define cxx_fast_tlscc i32* @_ZTW4sum2() #0 {
ret i32* @sum1
}
+; Make sure that at -O0 we don't generate spilling/reloading of the CSRs.
+; CHECK-O0-LABEL: tls_test2
+; CHECK-O0-NOT: pushq %r11
+; CHECK-O0-NOT: pushq %r10
+; CHECK-O0-NOT: pushq %r9
+; CHECK-O0-NOT: pushq %r8
+; CHECK-O0-NOT: pushq %rsi
+; CHECK-O0-NOT: pushq %rdx
+; CHECK-O0: callq {{.*}}tls_helper
+; CHECK-O0-NOT: popq %rdx
+; CHECK-O0-NOT: popq %rsi
+; CHECK-O0-NOT: popq %r8
+; CHECK-O0-NOT: popq %r9
+; CHECK-O0-NOT: popq %r10
+; CHECK-O0-NOT: popq %r11
+; CHECK-O0: ret
+%class.C = type { i32 }
+@tC = internal thread_local global %class.C zeroinitializer, align 4
+declare cxx_fast_tlscc void @tls_helper()
+define cxx_fast_tlscc %class.C* @tls_test2() #1 {
+ call cxx_fast_tlscc void @tls_helper()
+ ret %class.C* @tC
+}
+
+; Make sure we do not allow tail calls when caller and callee have different
+; calling conventions.
+declare %class.C* @_ZN1CD1Ev(%class.C* readnone returned %this)
+; CHECK-LABEL: tls_test
+; CHECK: callq {{.*}}tlv_atexit
+define cxx_fast_tlscc void @tls_test() {
+entry:
+ store i32 0, i32* getelementptr inbounds (%class.C, %class.C* @tC, i64 0, i32 0), align 4
+ %0 = tail call i32 @_tlv_atexit(void (i8*)* bitcast (%class.C* (%class.C*)* @_ZN1CD1Ev to void (i8*)*), i8* bitcast (%class.C* @tC to i8*), i8* nonnull @__dso_handle) #1
+ ret void
+}
+
+@ssp_var = internal thread_local global i8 0, align 1
+
+; CHECK-LABEL: test_ssp
+; CHECK-NOT: pushq %r11
+; CHECK-NOT: pushq %r10
+; CHECK-NOT: pushq %r9
+; CHECK-NOT: pushq %r8
+; CHECK-NOT: pushq %rsi
+; CHECK-NOT: pushq %rdx
+; CHECK-NOT: pushq %rcx
+; CHECK-NOT: pushq %rbx
+; CHECK: callq
+define cxx_fast_tlscc nonnull i8* @test_ssp() #2 {
+ ret i8* @ssp_var
+}
attributes #0 = { nounwind "no-frame-pointer-elim"="true" }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind sspreq }
diff --git a/test/CodeGen/X86/dag-optnone.ll b/test/CodeGen/X86/dag-optnone.ll
index f7774e6e8c54..e0e12553dac6 100644
--- a/test/CodeGen/X86/dag-optnone.ll
+++ b/test/CodeGen/X86/dag-optnone.ll
@@ -23,13 +23,12 @@
; The test cases @foo[WithOptnone] prove that the same DAG combine happens
-; with -O0 and with 'optnone' set. To prove this, we use a Windows triple to
-; cause fast-isel to bail out (because something about the calling convention
-; is not handled in fast-isel). Then we have a repeated fadd that can be
-; combined into an fmul. We show that this happens in both the non-optnone
-; function and the optnone function.
+; with -O0 and with 'optnone' set. To prove this, we use varargs to cause
+; fast-isel to bail out (varargs aren't handled in fast-isel). Then we have
+; a repeated fadd that can be combined into an fmul. We show that this
+; happens in both the non-optnone function and the optnone function.
-define float @foo(float %x) #0 {
+define float @foo(float %x, ...) #0 {
entry:
%add = fadd fast float %x, %x
%add1 = fadd fast float %add, %x
@@ -41,7 +40,7 @@ entry:
; CHECK: mul
; CHECK-NEXT: ret
-define float @fooWithOptnone(float %x) #1 {
+define float @fooWithOptnone(float %x, ...) #1 {
entry:
%add = fadd fast float %x, %x
%add1 = fadd fast float %add, %x
@@ -60,7 +59,7 @@ entry:
@id84 = common global <16 x i32> zeroinitializer, align 64
-define void @bar() #1 {
+define void @bar(...) #1 {
entry:
%id83 = alloca <16 x i8>, align 16
%0 = load <16 x i32>, <16 x i32>* @id84, align 64
diff --git a/test/CodeGen/X86/darwin-stub.ll b/test/CodeGen/X86/darwin-stub.ll
deleted file mode 100644
index 607f56fdd60b..000000000000
--- a/test/CodeGen/X86/darwin-stub.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | grep stub
-; RUN: llc < %s -mtriple=i386-apple-darwin9 | not grep stub
-
-@"\01LC" = internal constant [13 x i8] c"Hello World!\00" ; <[13 x i8]*> [#uses=1]
-
-define i32 @main() nounwind {
-entry:
- %0 = tail call i32 @puts(i8* getelementptr ([13 x i8], [13 x i8]* @"\01LC", i32 0, i32 0)) nounwind ; <i32> [#uses=0]
- ret i32 0
-}
-
-declare i32 @puts(i8*)
diff --git a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
index 54bd48926834..1ff4d10c2f8f 100644
--- a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
+++ b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
@@ -12,7 +12,7 @@
; CHECK: je .LBB0_4
; Regenerate test with this command:
-; clang -emit-llvm -S -O2 -g
+; clang++ -emit-llvm -S -O2 -g
; from this source:
;
; extern void foo(char *dst,unsigned siz,const char *src);
@@ -44,161 +44,170 @@
%struct.AAA3 = type { [4 x i8] }
@.str = private unnamed_addr constant [1 x i8] zeroinitializer, align 1
-@.str1 = private unnamed_addr constant [2 x i8] c"+\00", align 1
-@.str2 = private unnamed_addr constant [2 x i8] c"-\00", align 1
+@.str.1 = private unnamed_addr constant [2 x i8] c"+\00", align 1
+@.str.2 = private unnamed_addr constant [2 x i8] c"-\00", align 1
; Function Attrs: uwtable
define void @_Z3barii(i32 %param1, i32 %param2) #0 !dbg !24 {
entry:
%var1 = alloca %struct.AAA3, align 1
%var2 = alloca %struct.AAA3, align 1
- tail call void @llvm.dbg.value(metadata i32 %param1, i64 0, metadata !30, metadata !DIExpression()), !dbg !47
- tail call void @llvm.dbg.value(metadata i32 %param2, i64 0, metadata !31, metadata !DIExpression()), !dbg !47
- tail call void @llvm.dbg.value(metadata i8* null, i64 0, metadata !32, metadata !DIExpression()), !dbg !49
+ tail call void @llvm.dbg.value(metadata i32 %param1, i64 0, metadata !29, metadata !46), !dbg !47
+ tail call void @llvm.dbg.value(metadata i32 %param2, i64 0, metadata !30, metadata !46), !dbg !48
+ tail call void @llvm.dbg.value(metadata i8* null, i64 0, metadata !31, metadata !46), !dbg !49
%tobool = icmp eq i32 %param2, 0, !dbg !50
- br i1 %tobool, label %if.end, label %if.then, !dbg !50
+ br i1 %tobool, label %if.end, label %if.then, !dbg !52
if.then: ; preds = %entry
- %call = tail call i8* @_Z5i2stri(i32 %param2), !dbg !52
- tail call void @llvm.dbg.value(metadata i8* %call, i64 0, metadata !32, metadata !DIExpression()), !dbg !49
- br label %if.end, !dbg !54
+ %call = tail call i8* @_Z5i2stri(i32 %param2), !dbg !53
+ tail call void @llvm.dbg.value(metadata i8* %call, i64 0, metadata !31, metadata !46), !dbg !49
+ br label %if.end, !dbg !55
if.end: ; preds = %entry, %if.then
- tail call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !33, metadata !DIExpression()), !dbg !55
- tail call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !56, metadata !DIExpression()), !dbg !57
- tail call void @llvm.dbg.value(metadata !58, i64 0, metadata !59, metadata !DIExpression()), !dbg !60
- %arraydecay.i = getelementptr inbounds %struct.AAA3, %struct.AAA3* %var1, i64 0, i32 0, i64 0, !dbg !61
- call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0)), !dbg !61
- call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !34, metadata !DIExpression()), !dbg !63
- call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !64, metadata !DIExpression()), !dbg !65
- call void @llvm.dbg.value(metadata !58, i64 0, metadata !66, metadata !DIExpression()), !dbg !67
- %arraydecay.i5 = getelementptr inbounds %struct.AAA3, %struct.AAA3* %var2, i64 0, i32 0, i64 0, !dbg !68
- call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0)), !dbg !68
- %tobool1 = icmp eq i32 %param1, 0, !dbg !69
- call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !34, metadata !DIExpression()), !dbg !63
- br i1 %tobool1, label %if.else, label %if.then2, !dbg !69
+ %0 = getelementptr inbounds %struct.AAA3, %struct.AAA3* %var1, i64 0, i32 0, i64 0, !dbg !56
+ call void @llvm.lifetime.start(i64 4, i8* %0) #4, !dbg !56
+ tail call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !32, metadata !57), !dbg !58
+ tail call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !36, metadata !46), !dbg !59
+ tail call void @llvm.dbg.value(metadata i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0), i64 0, metadata !38, metadata !46), !dbg !62
+ call void @_Z3fooPcjPKc(i8* %0, i32 4, i8* nonnull getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0)), !dbg !63
+ %1 = getelementptr inbounds %struct.AAA3, %struct.AAA3* %var2, i64 0, i32 0, i64 0, !dbg !65
+ call void @llvm.lifetime.start(i64 4, i8* %1) #4, !dbg !65
+ call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !33, metadata !57), !dbg !66
+ call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !36, metadata !46), !dbg !67
+ call void @llvm.dbg.value(metadata i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0), i64 0, metadata !38, metadata !46), !dbg !69
+ call void @_Z3fooPcjPKc(i8* %1, i32 4, i8* nonnull getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0)), !dbg !70
+ %tobool1 = icmp eq i32 %param1, 0, !dbg !71
+ call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !33, metadata !57), !dbg !66
+ call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !41, metadata !46), !dbg !73
+ br i1 %tobool1, label %if.else, label %if.then2, !dbg !75
if.then2: ; preds = %if.end
- call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !71, metadata !DIExpression()), !dbg !73
- call void @llvm.dbg.value(metadata !74, i64 0, metadata !75, metadata !DIExpression()), !dbg !76
- call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str1, i64 0, i64 0)), !dbg !76
- br label %if.end3, !dbg !72
+ call void @llvm.dbg.value(metadata i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.1, i64 0, i64 0), i64 0, metadata !42, metadata !46), !dbg !76
+ call void @_Z3fooPcjPKc(i8* %1, i32 4, i8* nonnull getelementptr inbounds ([2 x i8], [2 x i8]* @.str.1, i64 0, i64 0)), !dbg !78
+ br label %if.end3, !dbg !79
if.else: ; preds = %if.end
- call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !77, metadata !DIExpression()), !dbg !79
- call void @llvm.dbg.value(metadata !80, i64 0, metadata !81, metadata !DIExpression()), !dbg !82
- call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str2, i64 0, i64 0)), !dbg !82
+ call void @llvm.dbg.value(metadata i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.2, i64 0, i64 0), i64 0, metadata !42, metadata !46), !dbg !80
+ call void @_Z3fooPcjPKc(i8* %1, i32 4, i8* nonnull getelementptr inbounds ([2 x i8], [2 x i8]* @.str.2, i64 0, i64 0)), !dbg !81
br label %if.end3
if.end3: ; preds = %if.else, %if.then2
- call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !33, metadata !DIExpression()), !dbg !55
- call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !83, metadata !DIExpression()), !dbg !85
- call void @llvm.dbg.value(metadata !58, i64 0, metadata !86, metadata !DIExpression()), !dbg !87
- call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0)), !dbg !87
- ret void, !dbg !88
+ call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !32, metadata !57), !dbg !58
+ call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !41, metadata !46), !dbg !82
+ call void @llvm.dbg.value(metadata i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0), i64 0, metadata !42, metadata !46), !dbg !84
+ call void @_Z3fooPcjPKc(i8* %0, i32 4, i8* nonnull getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0)), !dbg !85
+ call void @llvm.lifetime.end(i64 4, i8* %1) #4, !dbg !86
+ call void @llvm.lifetime.end(i64 4, i8* %0) #4, !dbg !87
+ ret void, !dbg !86
}
-declare i8* @_Z5i2stri(i32) #1
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
-declare void @_Z3fooPcjPKc(i8*, i32, i8*) #1
+declare i8* @_Z5i2stri(i32) #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+declare void @_Z3fooPcjPKc(i8*, i32, i8*) #2
; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #3
-attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone }
+attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind readnone }
+attributes #4 = { nounwind }
!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!44, !45}
-!llvm.ident = !{!46}
+!llvm.module.flags = !{!43, !44}
+!llvm.ident = !{!45}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: true, emissionKind: 1, file: !1, enums: !2, retainedTypes: !3, subprograms: !23, globals: !2, imports: !2)
-!1 = !DIFile(filename: "dbg-changes-codegen-branch-folding.cpp", directory: "/tmp/dbginfo")
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 255993) (llvm/trunk 256074)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3)
+!1 = !DIFile(filename: "test.cpp", directory: "/mnt/extra")
!2 = !{}
!3 = !{!4}
-!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "AAA3", line: 4, size: 32, align: 8, file: !1, elements: !5, identifier: "_ZTS4AAA3")
+!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "AAA3", file: !1, line: 4, size: 32, align: 8, elements: !5, identifier: "_ZTS4AAA3")
!5 = !{!6, !11, !17, !18}
-!6 = !DIDerivedType(tag: DW_TAG_member, name: "text", line: 8, size: 32, align: 8, file: !1, scope: !"_ZTS4AAA3", baseType: !7)
-!7 = !DICompositeType(tag: DW_TAG_array_type, size: 32, align: 8, baseType: !8, elements: !9)
-!8 = !DIBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
+!6 = !DIDerivedType(tag: DW_TAG_member, name: "text", scope: !4, file: !1, line: 8, baseType: !7, size: 32, align: 8)
+!7 = !DICompositeType(tag: DW_TAG_array_type, baseType: !8, size: 32, align: 8, elements: !9)
+!8 = !DIBasicType(name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
!9 = !{!10}
!10 = !DISubrange(count: 4)
-!11 = !DISubprogram(name: "AAA3", line: 5, isLocal: false, isDefinition: false, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 5, file: !1, scope: !"_ZTS4AAA3", type: !12)
+!11 = !DISubprogram(name: "AAA3", scope: !4, file: !1, line: 5, type: !12, isLocal: false, isDefinition: false, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true)
!12 = !DISubroutineType(types: !13)
!13 = !{null, !14, !15}
-!14 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, flags: DIFlagArtificial | DIFlagObjectPointer, baseType: !"_ZTS4AAA3")
-!15 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !16)
+!14 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !4, size: 64, align: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64, align: 64)
!16 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !8)
-!17 = !DISubprogram(name: "operator=", linkageName: "_ZN4AAA3aSEPKc", line: 6, isLocal: false, isDefinition: false, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 6, file: !1, scope: !"_ZTS4AAA3", type: !12)
-!18 = !DISubprogram(name: "operator const char *", linkageName: "_ZNK4AAA3cvPKcEv", line: 7, isLocal: false, isDefinition: false, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 7, file: !1, scope: !"_ZTS4AAA3", type: !19)
+!17 = !DISubprogram(name: "operator=", linkageName: "_ZN4AAA3aSEPKc", scope: !4, file: !1, line: 6, type: !12, isLocal: false, isDefinition: false, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: true)
+!18 = !DISubprogram(name: "operator const char *", linkageName: "_ZNK4AAA3cvPKcEv", scope: !4, file: !1, line: 7, type: !19, isLocal: false, isDefinition: false, scopeLine: 7, flags: DIFlagPrototyped, isOptimized: true)
!19 = !DISubroutineType(types: !20)
!20 = !{!15, !21}
-!21 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, flags: DIFlagArtificial | DIFlagObjectPointer, baseType: !22)
-!22 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !"_ZTS4AAA3")
-!23 = !{!24, !35, !40}
-!24 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barii", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 11, file: !1, scope: !25, type: !26, variables: !29)
-!25 = !DIFile(filename: "dbg-changes-codegen-branch-folding.cpp", directory: "/tmp/dbginfo")
-!26 = !DISubroutineType(types: !27)
-!27 = !{null, !28, !28}
-!28 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!29 = !{!30, !31, !32, !33, !34}
-!30 = !DILocalVariable(name: "param1", line: 11, arg: 1, scope: !24, file: !25, type: !28)
-!31 = !DILocalVariable(name: "param2", line: 11, arg: 2, scope: !24, file: !25, type: !28)
-!32 = !DILocalVariable(name: "temp", line: 12, scope: !24, file: !25, type: !15)
-!33 = !DILocalVariable(name: "var1", line: 17, scope: !24, file: !25, type: !"_ZTS4AAA3")
-!34 = !DILocalVariable(name: "var2", line: 18, scope: !24, file: !25, type: !"_ZTS4AAA3")
-!35 = distinct !DISubprogram(name: "operator=", linkageName: "_ZN4AAA3aSEPKc", line: 6, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 6, file: !1, scope: !"_ZTS4AAA3", type: !12, declaration: !17, variables: !36)
-!36 = !{!37, !39}
-!37 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38)
-!38 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !"_ZTS4AAA3")
-!39 = !DILocalVariable(name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15)
-!40 = distinct !DISubprogram(name: "AAA3", linkageName: "_ZN4AAA3C2EPKc", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 5, file: !1, scope: !"_ZTS4AAA3", type: !12, declaration: !11, variables: !41)
-!41 = !{!42, !43}
-!42 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !40, type: !38)
-!43 = !DILocalVariable(name: "value", line: 5, arg: 2, scope: !40, file: !25, type: !15)
-!44 = !{i32 2, !"Dwarf Version", i32 4}
-!45 = !{i32 2, !"Debug Info Version", i32 3}
-!46 = !{!"clang version 3.5.0 "}
-!47 = !DILocation(line: 11, scope: !24)
-!48 = !{i8* null}
-!49 = !DILocation(line: 12, scope: !24)
-!50 = !DILocation(line: 14, scope: !51)
-!51 = distinct !DILexicalBlock(line: 14, column: 0, file: !1, scope: !24)
-!52 = !DILocation(line: 15, scope: !53)
-!53 = distinct !DILexicalBlock(line: 14, column: 0, file: !1, scope: !51)
-!54 = !DILocation(line: 16, scope: !53)
-!55 = !DILocation(line: 17, scope: !24)
-!56 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !40, type: !38)
-!57 = !DILocation(line: 0, scope: !40, inlinedAt: !55)
-!58 = !{i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0)}
-!59 = !DILocalVariable(name: "value", line: 5, arg: 2, scope: !40, file: !25, type: !15)
-!60 = !DILocation(line: 5, scope: !40, inlinedAt: !55)
-!61 = !DILocation(line: 5, scope: !62, inlinedAt: !55)
-!62 = distinct !DILexicalBlock(line: 5, column: 0, file: !1, scope: !40)
-!63 = !DILocation(line: 18, scope: !24)
-!64 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !40, type: !38)
-!65 = !DILocation(line: 0, scope: !40, inlinedAt: !63)
-!66 = !DILocalVariable(name: "value", line: 5, arg: 2, scope: !40, file: !25, type: !15)
-!67 = !DILocation(line: 5, scope: !40, inlinedAt: !63)
-!68 = !DILocation(line: 5, scope: !62, inlinedAt: !63)
-!69 = !DILocation(line: 20, scope: !70)
-!70 = distinct !DILexicalBlock(line: 20, column: 0, file: !1, scope: !24)
-!71 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38)
-!72 = !DILocation(line: 21, scope: !70)
-!73 = !DILocation(line: 0, scope: !35, inlinedAt: !72)
-!74 = !{i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str1, i64 0, i64 0)}
-!75 = !DILocalVariable(name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15)
-!76 = !DILocation(line: 6, scope: !35, inlinedAt: !72)
-!77 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38)
-!78 = !DILocation(line: 23, scope: !70)
-!79 = !DILocation(line: 0, scope: !35, inlinedAt: !78)
-!80 = !{i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str2, i64 0, i64 0)}
-!81 = !DILocalVariable(name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15)
-!82 = !DILocation(line: 6, scope: !35, inlinedAt: !78)
-!83 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !35, type: !38)
-!84 = !DILocation(line: 24, scope: !24)
-!85 = !DILocation(line: 0, scope: !35, inlinedAt: !84)
-!86 = !DILocalVariable(name: "value", line: 6, arg: 2, scope: !35, file: !25, type: !15)
-!87 = !DILocation(line: 6, scope: !35, inlinedAt: !84)
-!88 = !DILocation(line: 25, scope: !24)
+!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !22, size: 64, align: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!22 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !4)
+!24 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barii", scope: !1, file: !1, line: 11, type: !25, isLocal: false, isDefinition: true, scopeLine: 11, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !28)
+!25 = !DISubroutineType(types: !26)
+!26 = !{null, !27, !27}
+!27 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!28 = !{!29, !30, !31, !32, !33}
+!29 = !DILocalVariable(name: "param1", arg: 1, scope: !24, file: !1, line: 11, type: !27)
+!30 = !DILocalVariable(name: "param2", arg: 2, scope: !24, file: !1, line: 11, type: !27)
+!31 = !DILocalVariable(name: "temp", scope: !24, file: !1, line: 12, type: !15)
+!32 = !DILocalVariable(name: "var1", scope: !24, file: !1, line: 17, type: !4)
+!33 = !DILocalVariable(name: "var2", scope: !24, file: !1, line: 18, type: !4)
+!34 = distinct !DISubprogram(name: "AAA3", linkageName: "_ZN4AAA3C2EPKc", scope: !4, file: !1, line: 5, type: !12, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !11, variables: !35)
+!35 = !{!36, !38}
+!36 = !DILocalVariable(name: "this", arg: 1, scope: !34, type: !37, flags: DIFlagArtificial | DIFlagObjectPointer)
+!37 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !4, size: 64, align: 64)
+!38 = !DILocalVariable(name: "value", arg: 2, scope: !34, file: !1, line: 5, type: !15)
+!39 = distinct !DISubprogram(name: "operator=", linkageName: "_ZN4AAA3aSEPKc", scope: !4, file: !1, line: 6, type: !12, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !17, variables: !40)
+!40 = !{!41, !42}
+!41 = !DILocalVariable(name: "this", arg: 1, scope: !39, type: !37, flags: DIFlagArtificial | DIFlagObjectPointer)
+!42 = !DILocalVariable(name: "value", arg: 2, scope: !39, file: !1, line: 6, type: !15)
+!43 = !{i32 2, !"Dwarf Version", i32 4}
+!44 = !{i32 2, !"Debug Info Version", i32 3}
+!45 = !{!"clang version 3.8.0 (trunk 255993) (llvm/trunk 256074)"}
+!46 = !DIExpression()
+!47 = !DILocation(line: 11, column: 15, scope: !24)
+!48 = !DILocation(line: 11, column: 26, scope: !24)
+!49 = !DILocation(line: 12, column: 16, scope: !24)
+!50 = !DILocation(line: 14, column: 7, scope: !51)
+!51 = distinct !DILexicalBlock(scope: !24, file: !1, line: 14, column: 7)
+!52 = !DILocation(line: 14, column: 7, scope: !24)
+!53 = !DILocation(line: 15, column: 12, scope: !54)
+!54 = distinct !DILexicalBlock(scope: !51, file: !1, line: 14, column: 15)
+!55 = !DILocation(line: 16, column: 3, scope: !54)
+!56 = !DILocation(line: 17, column: 3, scope: !24)
+!57 = !DIExpression(DW_OP_deref)
+!58 = !DILocation(line: 17, column: 8, scope: !24)
+!59 = !DILocation(line: 0, scope: !34, inlinedAt: !60)
+!60 = distinct !DILocation(line: 17, column: 8, scope: !61)
+!61 = !DILexicalBlockFile(scope: !24, file: !1, discriminator: 1)
+!62 = !DILocation(line: 5, column: 19, scope: !34, inlinedAt: !60)
+!63 = !DILocation(line: 5, column: 28, scope: !64, inlinedAt: !60)
+!64 = distinct !DILexicalBlock(scope: !34, file: !1, line: 5, column: 26)
+!65 = !DILocation(line: 18, column: 3, scope: !24)
+!66 = !DILocation(line: 18, column: 8, scope: !24)
+!67 = !DILocation(line: 0, scope: !34, inlinedAt: !68)
+!68 = distinct !DILocation(line: 18, column: 8, scope: !61)
+!69 = !DILocation(line: 5, column: 19, scope: !34, inlinedAt: !68)
+!70 = !DILocation(line: 5, column: 28, scope: !64, inlinedAt: !68)
+!71 = !DILocation(line: 20, column: 7, scope: !72)
+!72 = distinct !DILexicalBlock(scope: !24, file: !1, line: 20, column: 7)
+!73 = !DILocation(line: 0, scope: !39, inlinedAt: !74)
+!74 = distinct !DILocation(line: 23, column: 10, scope: !72)
+!75 = !DILocation(line: 20, column: 7, scope: !24)
+!76 = !DILocation(line: 6, column: 29, scope: !39, inlinedAt: !77)
+!77 = distinct !DILocation(line: 21, column: 10, scope: !72)
+!78 = !DILocation(line: 6, column: 38, scope: !39, inlinedAt: !77)
+!79 = !DILocation(line: 21, column: 5, scope: !72)
+!80 = !DILocation(line: 6, column: 29, scope: !39, inlinedAt: !74)
+!81 = !DILocation(line: 6, column: 38, scope: !39, inlinedAt: !74)
+!82 = !DILocation(line: 0, scope: !39, inlinedAt: !83)
+!83 = distinct !DILocation(line: 24, column: 8, scope: !24)
+!84 = !DILocation(line: 6, column: 29, scope: !39, inlinedAt: !83)
+!85 = !DILocation(line: 6, column: 38, scope: !39, inlinedAt: !83)
+!86 = !DILocation(line: 25, column: 1, scope: !24)
+!87 = !DILocation(line: 25, column: 1, scope: !61)
diff --git a/test/CodeGen/X86/dbg-combine.ll b/test/CodeGen/X86/dbg-combine.ll
index 3e78c316a06f..3a44fe186f97 100644
--- a/test/CodeGen/X86/dbg-combine.ll
+++ b/test/CodeGen/X86/dbg-combine.ll
@@ -74,11 +74,10 @@ attributes #2 = { nounwind }
!llvm.module.flags = !{!9, !10}
!llvm.ident = !{!11}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.7.0 (trunk 227074)", isOptimized: false, emissionKind: 1, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.7.0 (trunk 227074)", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
!1 = !DIFile(filename: "dbg-combine.c", directory: "/home/probinson/projects/scratch")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "foo", line: 1, isLocal: false, isDefinition: true, isOptimized: false, scopeLine: 2, file: !1, scope: !5, type: !6, variables: !2)
+!4 = distinct !DISubprogram(name: "foo", line: 1, isLocal: false, isDefinition: true, isOptimized: false, unit: !0, scopeLine: 2, file: !1, scope: !5, type: !6, variables: !2)
!5 = !DIFile(filename: "dbg-combine.c", directory: "/home/probinson/projects/scratch")
!6 = !DISubroutineType(types: !7)
!7 = !{!8}
diff --git a/test/CodeGen/X86/debugloc-argsize.ll b/test/CodeGen/X86/debugloc-argsize.ll
index 0283154abab2..75a791757c01 100644
--- a/test/CodeGen/X86/debugloc-argsize.ll
+++ b/test/CodeGen/X86/debugloc-argsize.ll
@@ -38,11 +38,10 @@ attributes #2 = { nounwind }
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 249520)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 249520)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "foo.cpp", directory: "foo")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, variables: !2)
+!4 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !2)
!5 = !DISubroutineType(types: !6)
!6 = !{null}
!7 = !{i32 2, !"Dwarf Version", i32 4}
diff --git a/test/CodeGen/X86/deopt-bundles.ll b/test/CodeGen/X86/deopt-bundles.ll
new file mode 100644
index 000000000000..1fb73ea252ee
--- /dev/null
+++ b/test/CodeGen/X86/deopt-bundles.ll
@@ -0,0 +1,161 @@
+; RUN: llc < %s | FileCheck %s
+; RUN: llc -O3 < %s | FileCheck %s
+; RUN: llc -O3 -debug-only=stackmaps < %s 2>&1 | FileCheck -check-prefix=STACKMAPS %s
+; REQUIRES: asserts
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+
+; STACKMAPS: Stack Maps: callsite 2882400015
+; STACKMAPS-NEXT: Stack Maps: has 4 locations
+; STACKMAPS-NEXT: Stack Maps: Loc 0: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 1: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 2: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: Loc 3: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: has 0 live-out registers
+; STACKMAPS-NEXT: Stack Maps: callsite 4242
+; STACKMAPS-NEXT: Stack Maps: has 4 locations
+; STACKMAPS-NEXT: Stack Maps: Loc 0: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 1: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 2: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: Loc 3: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: has 0 live-out registers
+; STACKMAPS-NEXT: Stack Maps: callsite 4243
+; STACKMAPS-NEXT: Stack Maps: has 4 locations
+; STACKMAPS-NEXT: Stack Maps: Loc 0: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 1: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 2: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: Loc 3: Constant 16 [encoding: .byte 4, .byte 8, .short 0, .int 16]
+; STACKMAPS-NEXT: Stack Maps: has 0 live-out registers
+; STACKMAPS-NEXT: Stack Maps: callsite 2882400015
+; STACKMAPS-NEXT: Stack Maps: has 4 locations
+; STACKMAPS-NEXT: Stack Maps: Loc 0: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 1: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 2: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: Loc 3: Constant 2 [encoding: .byte 4, .byte 8, .short 0, .int 2]
+; STACKMAPS-NEXT: Stack Maps: has 0 live-out registers
+; STACKMAPS-NEXT: Stack Maps: callsite 2882400015
+; STACKMAPS-NEXT: Stack Maps: has 4 locations
+; STACKMAPS-NEXT: Stack Maps: Loc 0: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 1: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 2: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: Loc 3: Constant 3 [encoding: .byte 4, .byte 8, .short 0, .int 3]
+; STACKMAPS-NEXT: Stack Maps: has 0 live-out registers
+; STACKMAPS-NEXT: Stack Maps: callsite 4243
+; STACKMAPS-NEXT: Stack Maps: has 4 locations
+; STACKMAPS-NEXT: Stack Maps: Loc 0: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 1: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 2: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: Loc 3: Constant 55 [encoding: .byte 4, .byte 8, .short 0, .int 55]
+; STACKMAPS-NEXT: Stack Maps: has 0 live-out registers
+
+
+declare i32 @callee_0()
+declare i32 @callee_1(i32)
+declare i32 @callee_vararg(...)
+
+define i32 @caller_0() {
+; CHECK-LABEL: _caller_0
+entry:
+ %v = call i32 @callee_0() [ "deopt"(i32 0) ]
+ %v2 = add i32 %v, 1
+ ret i32 %v2
+; CHECK: callq _callee_0
+; CHECK: incl %eax
+; CHECK: retq
+}
+
+define i32 @caller_1() {
+; CHECK-LABEL: _caller_1
+entry:
+ %v = call i32 @callee_1(i32 42) "statepoint-id"="4242" [ "deopt"(i32 1) ]
+ ret i32 %v
+; CHECK: callq _callee_1
+; CHECK: popq %rcx
+; CHECK: retq
+}
+
+define i32 @caller_vararg() {
+; CHECK-LABEL: _caller_vararg
+entry:
+; CHECK: movb $1, %al
+; CHECK: callq _callee_vararg
+ %v = call i32(...) @callee_vararg(i32 42, double 500.0) "statepoint-id"="4243" [ "deopt"(i32 16) ]
+ ret i32 %v
+}
+
+define i32 @invoker_0() personality i8 0 {
+; CHECK-LABEL: _invoker_0
+entry:
+ %v = invoke i32 @callee_0() [ "deopt"(i32 2) ]
+ to label %normal unwind label %uw
+
+normal:
+ ret i32 %v
+
+uw:
+ %ehvals = landingpad { i8*, i32 }
+ cleanup
+ ret i32 1
+; CHECK: callq _callee_0
+; CHECK: popq %rcx
+; CHECK: retq
+; CHECK: movl $1, %eax
+; CHECK: popq %rcx
+; CHECK: retq
+}
+
+define i32 @invoker_1() personality i8 0 {
+; CHECK-LABEL: _invoker_1
+entry:
+ %v = invoke i32 @callee_1(i32 45) "statepoint-num-patch-bytes"="9" [ "deopt"(i32 3) ]
+ to label %normal unwind label %uw
+
+normal:
+ ret i32 %v
+
+uw:
+ %ehvals = landingpad { i8*, i32 }
+ cleanup
+ ret i32 1
+; CHECK: movl $45, %edi
+; CHECK: nopw 512(%rax,%rax)
+; CHECK: popq %rcx
+; CHECK: retq
+; CHECK: movl $1, %eax
+; CHECK: popq %rcx
+; CHECK: retq
+}
+
+define i32 @invoker_2() personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+ %val = invoke i32 @callee_1(i32 1)
+ to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:
+ %cs1 = catchswitch within none [label %catch] unwind to caller
+
+catch:
+ %cp1 = catchpad within %cs1 [i8* null, i32 64, i8* null]
+ %val2 = call i32 @callee_1(i32 100) "statepoint-id"="4243" [ "funclet"(token %cp1), "deopt"(i32 55) ]
+ catchret from %cp1 to label %try.cont
+
+try.cont:
+ ret i32 0
+}
+
+declare i32 @__CxxFrameHandler3(...)
+
+define void @f_0(i64 %n) {
+ ; CHECK-LABEL: _f_0
+ %s = alloca i64
+ %vl = alloca i64, i64 %n
+ ; Check that the stackmap does not reference %s through
+ ; SP since the offset is not static because of %vl.
+ ; STACKMAPS: Loc 3: Direct 6
+ call void @g_0(i64* %vl) [ "deopt"(i64* %s) ]
+ ret void
+}
+
+declare void @g_0(i64* %vl)
diff --git a/test/CodeGen/X86/deopt-intrinsic-cconv.ll b/test/CodeGen/X86/deopt-intrinsic-cconv.ll
new file mode 100644
index 000000000000..8e240f8901d8
--- /dev/null
+++ b/test/CodeGen/X86/deopt-intrinsic-cconv.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s | FileCheck %s
+; RUN: llc -debug-only=stackmaps < %s 2>&1 | FileCheck --check-prefix=STACKMAPS %s
+; REQUIRES: asserts
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+declare webkit_jscc i64 @llvm.experimental.deoptimize.i64(...)
+
+define i64 @caller_1() {
+; CHECK-LABEL: _caller_1:
+; CHECK-NEXT: {{.+cfi.+}}
+; CHECK-NEXT: ##{{.+}}
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: {{Ltmp[0-9]+}}:
+; CHECK-NEXT: {{.+cfi.+}}
+; CHECK-NEXT: movl $1140457472, (%rsp) ## imm = 0x43FA0000
+; CHECK-NEXT: movl $42, %eax
+; CHECK-NEXT: callq ___llvm_deoptimize
+; CHECK-NEXT: {{Ltmp[0-9]+}}:
+
+entry:
+ %v = call webkit_jscc i64(...) @llvm.experimental.deoptimize.i64(i32 42, float 500.0) [ "deopt"(i32 3) ]
+ ret i64 %v
+}
+
+; STACKMAPS: Stack Maps: callsites:
+; STACKMAPS-NEXT: Stack Maps: callsite 2882400015
+; STACKMAPS-NEXT: Stack Maps: has 4 locations
+; STACKMAPS-NEXT: Stack Maps: Loc 0: Constant 12 [encoding: .byte 4, .byte 8, .short 0, .int 12]
+; STACKMAPS-NEXT: Stack Maps: Loc 1: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 2: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: Loc 3: Constant 3 [encoding: .byte 4, .byte 8, .short 0, .int 3]
+; STACKMAPS-NEXT: Stack Maps: has 0 live-out registers
diff --git a/test/CodeGen/X86/deopt-intrinsic.ll b/test/CodeGen/X86/deopt-intrinsic.ll
new file mode 100644
index 000000000000..ceed2d248821
--- /dev/null
+++ b/test/CodeGen/X86/deopt-intrinsic.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s | FileCheck %s
+; RUN: llc -debug-only=stackmaps < %s 2>&1 | FileCheck --check-prefix=STACKMAPS %s
+; REQUIRES: asserts
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+declare i32 @llvm.experimental.deoptimize.i32(...)
+declare i8 @llvm.experimental.deoptimize.i8(...)
+
+define i32 @caller_0() {
+; CHECK-LABEL: _caller_0:
+; CHECK-NEXT: {{.+cfi.+}}
+; CHECK-NEXT: ##{{.+}}
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: {{Ltmp[0-9]+}}:
+; CHECK-NEXT: {{.+cfi.+}}
+; CHECK-NEXT: callq ___llvm_deoptimize
+; CHECK-NEXT: {{Ltmp[0-9]+}}:
+entry:
+ %v = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 0) ]
+ ret i32 %v
+}
+
+define i8 @caller_1() {
+; CHECK-LABEL: _caller_1:
+; CHECK-NEXT: {{.+cfi.+}}
+; CHECK-NEXT: ##{{.+}}
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: {{Ltmp[0-9]+}}:
+; CHECK-NEXT: {{.+cfi.+}}
+; CHECK-NEXT: movss {{[a-zA-Z0-9_]+}}(%rip), %xmm0 ## xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movl $42, %edi
+; CHECK-NEXT: callq ___llvm_deoptimize
+; CHECK-NEXT: {{Ltmp[0-9]+}}:
+
+entry:
+ %v = call i8(...) @llvm.experimental.deoptimize.i8(i32 42, float 500.0) [ "deopt"(i32 1) ]
+ ret i8 %v
+}
+
+; STACKMAPS: Stack Maps: callsites:
+; STACKMAPS-NEXT: Stack Maps: callsite 2882400015
+; STACKMAPS-NEXT: Stack Maps: has 4 locations
+; STACKMAPS-NEXT: Stack Maps: Loc 0: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 1: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 2: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: Loc 3: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: has 0 live-out registers
+; STACKMAPS-NEXT: Stack Maps: callsite 2882400015
+; STACKMAPS-NEXT: Stack Maps: has 4 locations
+; STACKMAPS-NEXT: Stack Maps: Loc 0: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 1: Constant 0 [encoding: .byte 4, .byte 8, .short 0, .int 0]
+; STACKMAPS-NEXT: Stack Maps: Loc 2: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: Loc 3: Constant 1 [encoding: .byte 4, .byte 8, .short 0, .int 1]
+; STACKMAPS-NEXT: Stack Maps: has 0 live-out registers
diff --git a/test/CodeGen/X86/dllexport-x86_64.ll b/test/CodeGen/X86/dllexport-x86_64.ll
index 58e25f923971..e45f3ba91495 100644
--- a/test/CodeGen/X86/dllexport-x86_64.ll
+++ b/test/CodeGen/X86/dllexport-x86_64.ll
@@ -1,5 +1,7 @@
-; RUN: llc -mtriple x86_64-pc-win32 < %s | FileCheck -check-prefix=CHECK -check-prefix=WIN32 %s
+; RUN: llc -mtriple x86_64-pc-win32 < %s | FileCheck -check-prefix=CHECK -check-prefix=WIN32 %s
; RUN: llc -mtriple x86_64-pc-mingw32 < %s | FileCheck -check-prefix=CHECK -check-prefix=MINGW %s
+; RUN: llc -mtriple x86_64-pc-win32 < %s | FileCheck -check-prefix=NOTEXPORTED %s
+; RUN: llc -mtriple x86_64-pc-mingw32 < %s | FileCheck -check-prefix=NOTEXPORTED %s
; CHECK: .text
@@ -50,11 +52,19 @@ define weak_odr dllexport void @weak1() {
; CHECK: .globl WeakVar2
@WeakVar2 = weak_odr dllexport unnamed_addr constant i32 1
+; CHECK: .bss
+; CHECK: .globl WeakVar3
+@WeakVar3 = weak_odr dllexport global i32 0, align 4
+
; CHECK: .globl alias
; CHECK: alias = notExported
@alias = dllexport alias void(), void()* @notExported
+; CHECK: .globl aliasNotExported
+; CHECK: aliasNotExported = f1
+@aliasNotExported = alias void(), void()* @f1
+
; CHECK: .globl alias2
; CHECK: alias2 = f1
@alias2 = dllexport alias void(), void()* @f1
@@ -70,6 +80,23 @@ define weak_odr dllexport void @weak1() {
@blob = global [6 x i8] c"\B8*\00\00\00\C3", section ".text", align 16
@blob_alias = dllexport alias i32 (), bitcast ([6 x i8]* @blob to i32 ()*)
+@exportedButNotDefinedVariable = external dllexport global i32
+declare dllexport void @exportedButNotDefinedFunction()
+define void @foo() {
+entry:
+ store i32 4, i32* @exportedButNotDefinedVariable, align 4
+ call void @exportedButNotDefinedFunction()
+ ret void
+}
+
+; Verify that items that should not be exported do not appear in the export table.
+; We use a separate check prefix to avoid confusion between -NOT and -SAME.
+; NOTEXPORTED: .section .drectve
+; NOTEXPORTED-NOT: notExported
+; NOTEXPORTED-NOT: aliasNotExported
+; NOTEXPORTED-NOT: exportedButNotDefinedVariable
+; NOTEXPORTED-NOT: exportedButNotDefinedFunction
+
; CHECK: .section .drectve
; WIN32: /EXPORT:f1
; WIN32-SAME: /EXPORT:f2
@@ -81,6 +108,7 @@ define weak_odr dllexport void @weak1() {
; WIN32-SAME: /EXPORT:Var3,DATA
; WIN32-SAME: /EXPORT:WeakVar1,DATA
; WIN32-SAME: /EXPORT:WeakVar2,DATA
+; WIN32-SAME: /EXPORT:WeakVar3,DATA
; WIN32-SAME: /EXPORT:alias
; WIN32-SAME: /EXPORT:alias2
; WIN32-SAME: /EXPORT:alias3
@@ -96,6 +124,7 @@ define weak_odr dllexport void @weak1() {
; MINGW-SAME: -export:Var3,data
; MINGW-SAME: -export:WeakVar1,data
; MINGW-SAME: -export:WeakVar2,data
+; MINGW-SAME: -export:WeakVar3,data
; MINGW-SAME: -export:alias
; MINGW-SAME: -export:alias2
; MINGW-SAME: -export:alias3
diff --git a/test/CodeGen/X86/dllexport.ll b/test/CodeGen/X86/dllexport.ll
index cde0955410b7..d833f3c22ffc 100644
--- a/test/CodeGen/X86/dllexport.ll
+++ b/test/CodeGen/X86/dllexport.ll
@@ -4,6 +4,12 @@
; RUN: | FileCheck -check-prefix CHECK -check-prefix CHECK-GCC %s
; RUN: llc -mtriple i686-pc-cygwin %s -o - \
; RUN: | FileCheck -check-prefix CHECK -check-prefix CHECK-GCC %s
+; RUN: llc -mtriple i386-pc-win32 < %s \
+; RUN: | FileCheck -check-prefix NOTEXPORTED %s
+; RUN: llc -mtriple i386-pc-mingw32 < %s \
+; RUN: | FileCheck -check-prefix NOTEXPORTED %s
+; RUN: llc -mtriple i686-pc-cygwin %s -o - \
+; RUN: | FileCheck -check-prefix NOTEXPORTED %s
; CHECK: .text
@@ -21,7 +27,7 @@ define dllexport void @f2() unnamed_addr {
ret void
}
-declare dllexport void @not_defined()
+declare dllexport void @notDefined()
; CHECK: .globl _stdfun@0
define dllexport x86_stdcallcc void @stdfun() nounwind {
@@ -88,8 +94,13 @@ define weak_odr dllexport void @weak1() {
; CHECK: _weak_alias = _f1
@weak_alias = weak_odr dllexport alias void(), void()* @f1
+; Verify that items that should not be exported do not appear in the export table.
+; We use a separate check prefix to avoid confusion between -NOT and -SAME.
+; NOTEXPORTED: .section .drectve
+; NOTEXPORTED-NOT: notExported
+; NOTEXPORTED-NOT: notDefined
+
; CHECK: .section .drectve
-; CHECK-CL-NOT: not_exported
; CHECK-CL: /EXPORT:_f1
; CHECK-CL-SAME: /EXPORT:_f2
; CHECK-CL-SAME: /EXPORT:_stdfun@0
@@ -107,8 +118,6 @@ define weak_odr dllexport void @weak1() {
; CHECK-CL-SAME: /EXPORT:_alias2
; CHECK-CL-SAME: /EXPORT:_alias3
; CHECK-CL-SAME: /EXPORT:_weak_alias"
-; CHECK-CL-NOT: not_exported
-; CHECK-GCC-NOT: not_exported
; CHECK-GCC: -export:f1
; CHECK-GCC-SAME: -export:f2
; CHECK-GCC-SAME: -export:stdfun@0
@@ -126,4 +135,3 @@ define weak_odr dllexport void @weak1() {
; CHECK-GCC-SAME: -export:alias2
; CHECK-GCC-SAME: -export:alias3
; CHECK-GCC-SAME: -export:weak_alias"
-; CHECK-GCC-NOT: not_exported
diff --git a/test/CodeGen/X86/dwarf-comp-dir.ll b/test/CodeGen/X86/dwarf-comp-dir.ll
index 31d2724aade3..b744a70288e5 100644
--- a/test/CodeGen/X86/dwarf-comp-dir.ll
+++ b/test/CodeGen/X86/dwarf-comp-dir.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu"
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!5}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.1 (trunk 143523)", isOptimized: true, emissionKind: 0, file: !4, enums: !2, retainedTypes: !7, subprograms: !2, globals: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.1 (trunk 143523)", isOptimized: true, emissionKind: FullDebug, file: !4, enums: !2, retainedTypes: !7, globals: !2)
!2 = !{}
!3 = !DIFile(filename: "empty.c", directory: "/home/nlewycky")
!4 = !DIFile(filename: "empty.c", directory: "/home/nlewycky")
diff --git a/test/CodeGen/X86/dynamic-alloca-in-entry.ll b/test/CodeGen/X86/dynamic-alloca-in-entry.ll
index 7ed471c2f502..2b5721d7fcf1 100644
--- a/test/CodeGen/X86/dynamic-alloca-in-entry.ll
+++ b/test/CodeGen/X86/dynamic-alloca-in-entry.ll
@@ -15,5 +15,5 @@ define void @bar() {
ret void
}
; CHECK-LABEL: _bar:
-; CHECK: calll __chkstk
+; CHECK: pushl %eax
; CHECK: retl
diff --git a/test/CodeGen/X86/dynamic-allocas-VLAs.ll b/test/CodeGen/X86/dynamic-allocas-VLAs.ll
index b0334d6a63ef..71e589275ede 100644
--- a/test/CodeGen/X86/dynamic-allocas-VLAs.ll
+++ b/test/CodeGen/X86/dynamic-allocas-VLAs.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mcpu=generic -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s
-; RUN: llc < %s -mcpu=generic -stackrealign -stack-alignment=32 -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s -check-prefix=FORCE-ALIGN
+; RUN: llc < %s -stack-symbol-ordering=0 -mcpu=generic -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -mcpu=generic -stackrealign -stack-alignment=32 -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s -check-prefix=FORCE-ALIGN
; rdar://11496434
; no VLAs or dynamic alignment
@@ -60,12 +60,10 @@ entry:
; CHECK: _t3
; CHECK: pushq %rbp
; CHECK: movq %rsp, %rbp
-; CHECK: pushq %rbx
; CHECK-NOT: andq $-{{[0-9]+}}, %rsp
; CHECK: subq ${{[0-9]+}}, %rsp
;
-; CHECK: leaq -{{[0-9]+}}(%rbp), %rsp
-; CHECK: popq %rbx
+; CHECK: movq %rbp, %rsp
; CHECK: popq %rbp
}
@@ -85,7 +83,6 @@ entry:
; CHECK: _t4
; CHECK: pushq %rbp
; CHECK: movq %rsp, %rbp
-; CHECK: pushq %r14
; CHECK: pushq %rbx
; CHECK: andq $-32, %rsp
; CHECK: subq ${{[0-9]+}}, %rsp
@@ -95,9 +92,8 @@ entry:
; CHECK: leaq {{[0-9]*}}(%rbx), %rdx
; CHECK: callq _t4_helper
;
-; CHECK: leaq -16(%rbp), %rsp
+; CHECK: leaq -{{[0-9]+}}(%rbp), %rsp
; CHECK: popq %rbx
-; CHECK: popq %r14
; CHECK: popq %rbp
}
diff --git a/test/CodeGen/X86/eflags-copy-expansion.mir b/test/CodeGen/X86/eflags-copy-expansion.mir
new file mode 100644
index 000000000000..bf2d0be67c12
--- /dev/null
+++ b/test/CodeGen/X86/eflags-copy-expansion.mir
@@ -0,0 +1,67 @@
+# RUN: llc -run-pass postrapseudos -mtriple=i386-apple-macosx -o - %s | FileCheck %s
+
+# Verify that we correctly save and restore eax when copying eflags,
+# even when only a smaller alias of eax is used. We used to check only
+# eax and not its aliases.
+# PR27624.
+
+--- |
+ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+ define void @foo() {
+ entry:
+ br label %false
+ false:
+ ret void
+ }
+
+...
+
+---
+name: foo
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%edi' }
+body: |
+ bb.0.entry:
+ liveins: %edi
+ successors: %bb.1.false
+ NOOP implicit-def %al
+
+ ; The bug was triggered only when LivePhysRegs is used, which
+ ; happens only when the heuristic for the liveness computation
+ ; fails. The liveness computation heuristic looks at 10 instructions
+ ; before and after the copy. Make sure we do not reach the definition of
+ ; AL within 10 instructions; otherwise the heuristic will see that it is live.
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ NOOP
+ ; Save AL.
+ ; CHECK: PUSH32r killed %eax
+
+ ; Copy EDI into EFLAGS
+ ; CHECK-NEXT: %eax = MOV32rr %edi
+ ; CHECK-NEXT: %al = ADD8ri %al, 127, implicit-def %eflags
+ ; CHECK-NEXT: SAHF implicit-def %eflags, implicit %ah
+ %eflags = COPY %edi
+
+ ; Restore AL.
+ ; CHECK-NEXT: %eax = POP32r
+ bb.1.false:
+ liveins: %al
+ NOOP implicit %al
+ RETQ
+
+...
diff --git a/test/CodeGen/X86/emutls-pic.ll b/test/CodeGen/X86/emutls-pic.ll
index 11676aff1892..50dc72653aea 100644
--- a/test/CodeGen/X86/emutls-pic.ll
+++ b/test/CodeGen/X86/emutls-pic.ll
@@ -82,28 +82,29 @@ entry:
}
; X32-LABEL: f5:
-; X32: movl __emutls_v.j@GOT(%ebx), %eax
+; X32: leal __emutls_v.j@GOTOFF(%ebx), %eax
; X32-NEXT: movl %eax, (%esp)
; X32-NEXT: calll __emutls_get_address@PLT
; X32-NEXT: movl (%eax), %esi
-; X32-NEXT: movl __emutls_v.k@GOT(%ebx), %eax
+; X32-NEXT: leal __emutls_v.k@GOTOFF(%ebx), %eax
; X32-NEXT: movl %eax, (%esp)
; X32-NEXT: calll __emutls_get_address@PLT
; X32-NEXT: addl (%eax), %esi
; X32-NEXT: movl %esi, %eax
; X64-LABEL: f5:
-; X64: movq __emutls_v.j@GOTPCREL(%rip), %rdi
+; X64: leaq __emutls_v.j(%rip), %rdi
; X64-NEXT: callq __emutls_get_address@PLT
; X64-NEXT: movl (%rax), %ebx
-; X64-NEXT: movq __emutls_v.k@GOTPCREL(%rip), %rdi
+; X64-NEXT: leaq __emutls_v.k(%rip), %rdi
; X64-NEXT: callq __emutls_get_address@PLT
; X64-NEXT: addl (%rax), %ebx
; X64-NEXT: movl %ebx, %eax
;;;;; 32-bit targets
-; X32: .data
+; X32: .data{{$}}
+; X32: .globl __emutls_v.i
; X32-LABEL: __emutls_v.i:
; X32-NEXT: .long 4
; X32-NEXT: .long 4
@@ -114,7 +115,8 @@ entry:
; X32-LABEL: __emutls_t.i:
; X32-NEXT: .long 15
-; X32: .data
+; X32: .data{{$}}
+; X32-NOT: .globl
; X32-LABEL: __emutls_v.j:
; X32-NEXT: .long 4
; X32-NEXT: .long 4
@@ -125,7 +127,8 @@ entry:
; X32-LABEL: __emutls_t.j:
; X32-NEXT: .long 42
-; X32: .data
+; X32: .data{{$}}
+; X32-NOT: .globl
; X32-LABEL: __emutls_v.k:
; X32-NEXT: .long 4
; X32-NEXT: .long 8
@@ -136,7 +139,8 @@ entry:
;;;;; 64-bit targets
-; X64: .data
+; X64: .data{{$}}
+; X64: .globl __emutls_v.i
; X64-LABEL: __emutls_v.i:
; X64-NEXT: .quad 4
; X64-NEXT: .quad 4
@@ -147,7 +151,8 @@ entry:
; X64-LABEL: __emutls_t.i:
; X64-NEXT: .long 15
-; X64: .data
+; X64: .data{{$}}
+; X64-NOT: .globl
; X64-LABEL: __emutls_v.j:
; X64-NEXT: .quad 4
; X64-NEXT: .quad 4
@@ -158,7 +163,8 @@ entry:
; X64-LABEL: __emutls_t.j:
; X64-NEXT: .long 42
-; X64: .data
+; X64: .data{{$}}
+; X64-NOT: .globl
; X64-LABEL: __emutls_v.k:
; X64-NEXT: .quad 4
; X64-NEXT: .quad 8
diff --git a/test/CodeGen/X86/emutls-pie.ll b/test/CodeGen/X86/emutls-pie.ll
index 45e5c38c0d8a..5db8c888a4e4 100644
--- a/test/CodeGen/X86/emutls-pie.ll
+++ b/test/CodeGen/X86/emutls-pie.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -emulated-tls -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic -enable-pie \
+; RUN: llc < %s -emulated-tls -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X32 %s
-; RUN: llc < %s -emulated-tls -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic -enable-pie \
+; RUN: llc < %s -emulated-tls -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X64 %s
-; RUN: llc < %s -emulated-tls -march=x86 -mcpu=generic -mtriple=i386-linux-android -relocation-model=pic -enable-pie \
+; RUN: llc < %s -emulated-tls -march=x86 -mcpu=generic -mtriple=i386-linux-android -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X32 %s
-; RUN: llc < %s -emulated-tls -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-android -relocation-model=pic -enable-pie \
+; RUN: llc < %s -emulated-tls -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-android -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X64 %s
; Use my_emutls_get_address like __emutls_get_address.
@@ -39,7 +39,7 @@ entry:
define i32 @f1() {
; X32-LABEL: f1:
-; X32: movl __emutls_v.i@GOT(%ebx), %eax
+; X32: leal __emutls_v.i@GOTOFF(%ebx), %eax
; X32-NEXT: movl %eax, (%esp)
; X32-NEXT: calll __emutls_get_address@PLT
; X32-NEXT: movl (%eax), %eax
@@ -47,7 +47,7 @@ define i32 @f1() {
; X32-NEXT: popl %ebx
; X32-NEXT: retl
; X64-LABEL: f1:
-; X64: movq __emutls_v.i@GOTPCREL(%rip), %rdi
+; X64: leaq __emutls_v.i(%rip), %rdi
; X64-NEXT: callq __emutls_get_address@PLT
; X64-NEXT: movl (%rax), %eax
; X64-NEXT: popq %rcx
@@ -60,11 +60,11 @@ entry:
define i32* @f2() {
; X32-LABEL: f2:
-; X32: movl __emutls_v.i@GOT(%ebx), %eax
+; X32: leal __emutls_v.i@GOTOFF(%ebx), %eax
; X32-NEXT: movl %eax, (%esp)
; X32-NEXT: calll __emutls_get_address@PLT
; X64-LABEL: f2:
-; X64: movq __emutls_v.i@GOTPCREL(%rip), %rdi
+; X64: leaq __emutls_v.i(%rip), %rdi
; X64-NEXT: callq __emutls_get_address@PLT
entry:
@@ -129,3 +129,8 @@ entry:
; X64-NOT: __emutls_v.i2
; X64-NOT: __emutls_t.i2
+
+
+!llvm.module.flags = !{!0, !1}
+!0 = !{i32 1, !"PIC Level", i32 1}
+!1 = !{i32 1, !"PIE Level", i32 1}
diff --git a/test/CodeGen/X86/emutls_generic.ll b/test/CodeGen/X86/emutls_generic.ll
index b99a195426c2..16d90001426f 100644
--- a/test/CodeGen/X86/emutls_generic.ll
+++ b/test/CodeGen/X86/emutls_generic.ll
@@ -45,18 +45,19 @@ entry:
; CHECK: __emutls_t.internal_y
; X86_32-LABEL: get_external_x:
-; X86_32: movl __emutls_v.external_x
+; X86_32: movl __emutls_v.external_x@GOT(%ebx)
; X86_32: calll __emutls_get_address
; X86_32-LABEL: get_external_y:
-; X86_32: movl __emutls_v.external_y
+; X86_32: movl __emutls_v.external_y@GOT(%ebx)
; X86_32: calll __emutls_get_address
; X86_32-LABEL: get_internal_y:
-; X86_32: movl __emutls_v.internal_y
-; X86_32: calll __emutls_get_address
-; X86_32-NOT: __emutls_t.external_x
-; X86_32-NOT: __emutls_v.external_x:
-; X86_32: .data
-; X86_32: .align 4
+; X86_32: leal __emutls_v.internal_y@GOTOFF(%ebx)
+; X86_32: calll __emutls_get_address
+; X86_32-NOT: __emutls_t.external_x
+; X86_32-NOT: __emutls_v.external_x:
+; X86_32: .data{{$}}
+; X86_32: .globl __emutls_v.external_y
+; X86_32: .p2align 2
; X86_32-LABEL: __emutls_v.external_y:
; X86_32-NEXT: .long 1
; X86_32-NEXT: .long 2
@@ -65,8 +66,9 @@ entry:
; X86_32: .section .rodata,
; X86_32-LABEL: __emutls_t.external_y:
; X86_32-NEXT: .byte 7
-; X86_32: .data
-; X86_32: .align 4
+; X86_32: .data{{$}}
+; X86_32-NOT: .globl
+; X86_32: .p2align 2
; X86_32-LABEL: __emutls_v.internal_y:
; X86_32-NEXT: .long 8
; X86_32-NEXT: .long 16
@@ -75,17 +77,18 @@ entry:
; X86_32-LABEL: __emutls_t.internal_y:
; X86_32-NEXT: .quad 9
; X86_64-LABEL: get_external_x:
-; X86_64: __emutls_v.external_x
-; X86_64: __emutls_get_address
+; X86_64: __emutls_v.external_x@GOTPCREL(%rip)
+; X86_64: __emutls_get_address
; X86_64-LABEL: get_external_y:
-; X86_64: __emutls_v.external_y
-; X86_64: __emutls_get_address
+; X86_64: __emutls_v.external_y@GOTPCREL(%rip)
+; X86_64: __emutls_get_address
; X86_64-LABEL: get_internal_y:
-; X86_64: __emutls_v.internal_y
-; X86_64: __emutls_get_address
-; X86_64-NOT: __emutls_t.external_x
-; X86_64-NOT: __emutls_v.external_x:
-; X86_64: .align 8
+; X86_64: __emutls_v.internal_y(%rip)
+; X86_64: __emutls_get_address
+; X86_64-NOT: __emutls_t.external_x
+; X86_64-NOT: __emutls_v.external_x:
+; X86_64: .globl __emutls_v.external_y
+; X86_64: .p2align 3
; X86_64-LABEL: __emutls_v.external_y:
; X86_64-NEXT: .quad 1
; X86_64-NEXT: .quad 2
@@ -95,8 +98,9 @@ entry:
; X86_64: .section .rodata,
; X86_64-LABEL: __emutls_t.external_y:
; X86_64-NEXT: .byte 7
-; X86_64: .data
-; X86_64: .align 8
+; X86_64: .data{{$}}
+; X86_64-NOT: .globl
+; X86_64: .p2align 3
; X86_64-LABEL: __emutls_v.internal_y:
; X86_64-NEXT: .quad 8
; X86_64-NEXT: .quad 16
diff --git a/test/CodeGen/X86/exedepsfix-broadcast.ll b/test/CodeGen/X86/exedepsfix-broadcast.ll
index ab92fe0d1d0c..992b3a395e7b 100644
--- a/test/CodeGen/X86/exedepsfix-broadcast.ll
+++ b/test/CodeGen/X86/exedepsfix-broadcast.ll
@@ -1,13 +1,16 @@
-; RUN: llc -O3 -mtriple=x86_64-apple-macosx -o - < %s -mattr=+avx2 -enable-unsafe-fp-math -mcpu=core2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx2 -enable-unsafe-fp-math | FileCheck %s
+
; Check that the ExeDepsFix pass correctly fixes the domain for broadcast instructions.
; <rdar://problem/16354675>
-; CHECK-LABEL: ExeDepsFix_broadcastss
-; CHECK: broadcastss
-; CHECK: vandps
-; CHECK: vmaxps
-; CHECK: ret
define <4 x float> @ExeDepsFix_broadcastss(<4 x float> %arg, <4 x float> %arg2) {
+; CHECK-LABEL: ExeDepsFix_broadcastss:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
+; CHECK-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%bitcast = bitcast <4 x float> %arg to <4 x i32>
%and = and <4 x i32> %bitcast, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
%floatcast = bitcast <4 x i32> %and to <4 x float>
@@ -16,12 +19,13 @@ define <4 x float> @ExeDepsFix_broadcastss(<4 x float> %arg, <4 x float> %arg2)
ret <4 x float> %max
}
-; CHECK-LABEL: ExeDepsFix_broadcastss256
-; CHECK: broadcastss
-; CHECK: vandps
-; CHECK: vmaxps
-; CHECK: ret
define <8 x float> @ExeDepsFix_broadcastss256(<8 x float> %arg, <8 x float> %arg2) {
+; CHECK-LABEL: ExeDepsFix_broadcastss256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
+; CHECK-NEXT: vandps %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bitcast = bitcast <8 x float> %arg to <8 x i32>
%and = and <8 x i32> %bitcast, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
%floatcast = bitcast <8 x i32> %and to <8 x float>
@@ -30,13 +34,14 @@ define <8 x float> @ExeDepsFix_broadcastss256(<8 x float> %arg, <8 x float> %arg
ret <8 x float> %max
}
-
-; CHECK-LABEL: ExeDepsFix_broadcastss_inreg
-; CHECK: broadcastss
-; CHECK: vandps
-; CHECK: vmaxps
-; CHECK: ret
define <4 x float> @ExeDepsFix_broadcastss_inreg(<4 x float> %arg, <4 x float> %arg2, i32 %broadcastvalue) {
+; CHECK-LABEL: ExeDepsFix_broadcastss_inreg:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd %edi, %xmm2
+; CHECK-NEXT: vbroadcastss %xmm2, %xmm2
+; CHECK-NEXT: vandps %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%bitcast = bitcast <4 x float> %arg to <4 x i32>
%in = insertelement <4 x i32> undef, i32 %broadcastvalue, i32 0
%mask = shufflevector <4 x i32> %in, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -47,12 +52,14 @@ define <4 x float> @ExeDepsFix_broadcastss_inreg(<4 x float> %arg, <4 x float> %
ret <4 x float> %max
}
-; CHECK-LABEL: ExeDepsFix_broadcastss256_inreg
-; CHECK: broadcastss
-; CHECK: vandps
-; CHECK: vmaxps
-; CHECK: ret
define <8 x float> @ExeDepsFix_broadcastss256_inreg(<8 x float> %arg, <8 x float> %arg2, i32 %broadcastvalue) {
+; CHECK-LABEL: ExeDepsFix_broadcastss256_inreg:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovd %edi, %xmm2
+; CHECK-NEXT: vbroadcastss %xmm2, %ymm2
+; CHECK-NEXT: vandps %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bitcast = bitcast <8 x float> %arg to <8 x i32>
%in = insertelement <8 x i32> undef, i32 %broadcastvalue, i32 0
%mask = shufflevector <8 x i32> %in, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -63,12 +70,13 @@ define <8 x float> @ExeDepsFix_broadcastss256_inreg(<8 x float> %arg, <8 x float
ret <8 x float> %max
}
-; CHECK-LABEL: ExeDepsFix_broadcastsd
; In that case the broadcast is directly folded into vandpd.
-; CHECK: vandpd
-; CHECK: vmaxpd
-; CHECK:ret
define <2 x double> @ExeDepsFix_broadcastsd(<2 x double> %arg, <2 x double> %arg2) {
+; CHECK-LABEL: ExeDepsFix_broadcastsd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vandpd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%bitcast = bitcast <2 x double> %arg to <2 x i64>
%and = and <2 x i64> %bitcast, <i64 2147483647, i64 2147483647>
%floatcast = bitcast <2 x i64> %and to <2 x double>
@@ -77,12 +85,13 @@ define <2 x double> @ExeDepsFix_broadcastsd(<2 x double> %arg, <2 x double> %arg
ret <2 x double> %max
}
-; CHECK-LABEL: ExeDepsFix_broadcastsd256
-; CHECK: broadcastsd
-; CHECK: vandpd
-; CHECK: vmaxpd
-; CHECK: ret
define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> %arg2) {
+; CHECK-LABEL: ExeDepsFix_broadcastsd256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
+; CHECK-NEXT: vandpd %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bitcast = bitcast <4 x double> %arg to <4 x i64>
%and = and <4 x i64> %bitcast, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
%floatcast = bitcast <4 x i64> %and to <4 x double>
@@ -91,16 +100,16 @@ define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> %
ret <4 x double> %max
}
-
-; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg
; ExeDepsFix works top down, thus it coalesces vpunpcklqdq domain with
; vpand and there is nothing more you can do to match vmaxpd.
-; CHECK: vmovq
-; CHECK: vpbroadcastq
-; CHECK: vpand
-; CHECK: vmaxpd
-; CHECK: ret
define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double> %arg2, i64 %broadcastvalue) {
+; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovq %rdi, %xmm2
+; CHECK-NEXT: vpbroadcastq %xmm2, %xmm2
+; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%bitcast = bitcast <2 x double> %arg to <2 x i64>
%in = insertelement <2 x i64> undef, i64 %broadcastvalue, i32 0
%mask = shufflevector <2 x i64> %in, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -111,12 +120,14 @@ define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double
ret <2 x double> %max
}
-; CHECK-LABEL: ExeDepsFix_broadcastsd256_inreg
-; CHECK: broadcastsd
-; CHECK: vandpd
-; CHECK: vmaxpd
-; CHECK: ret
define <4 x double> @ExeDepsFix_broadcastsd256_inreg(<4 x double> %arg, <4 x double> %arg2, i64 %broadcastvalue) {
+; CHECK-LABEL: ExeDepsFix_broadcastsd256_inreg:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovq %rdi, %xmm2
+; CHECK-NEXT: vbroadcastsd %xmm2, %ymm2
+; CHECK-NEXT: vandpd %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%bitcast = bitcast <4 x double> %arg to <4 x i64>
%in = insertelement <4 x i64> undef, i64 %broadcastvalue, i32 0
%mask = shufflevector <4 x i64> %in, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -126,4 +137,3 @@ define <4 x double> @ExeDepsFix_broadcastsd256_inreg(<4 x double> %arg, <4 x dou
%max = select <4 x i1> %max_is_x, <4 x double> %floatcast, <4 x double> %arg2
ret <4 x double> %max
}
-
diff --git a/test/CodeGen/X86/expand-vr64-gr64-copy.mir b/test/CodeGen/X86/expand-vr64-gr64-copy.mir
index 8ce1c7eaae70..3598c045ad53 100644
--- a/test/CodeGen/X86/expand-vr64-gr64-copy.mir
+++ b/test/CodeGen/X86/expand-vr64-gr64-copy.mir
@@ -1,4 +1,4 @@
-# RUN: llc -run-pass postrapseudos -mtriple=x86_64-unknown-unknown -mattr=+3dnow -o /dev/null %s | FileCheck %s
+# RUN: llc -run-pass postrapseudos -mtriple=x86_64-unknown-unknown -mattr=+3dnow -o - %s | FileCheck %s
# This test verifies that the ExpandPostRA pass expands the GR64 <-> VR64
# copies into appropriate MMX_MOV instructions.
diff --git a/test/CodeGen/X86/extractelement-index.ll b/test/CodeGen/X86/extractelement-index.ll
index ab3ff8ed435e..eb7cdb6b57be 100644
--- a/test/CodeGen/X86/extractelement-index.ll
+++ b/test/CodeGen/X86/extractelement-index.ll
@@ -1,51 +1,636 @@
-; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s -check-prefix=AVX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+;
+; ExtractElement - Constant Index
+;
-; CHECK-LABEL: extractelement_index_1:
-define i8 @extractelement_index_1(<32 x i8> %a) nounwind {
- ; X64: movaps
- ; AVX: vpextrb $1
+define i8 @extractelement_v16i8_1(<16 x i8> %a) nounwind {
+; SSE2-LABEL: extractelement_v16i8_1:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v16i8_1:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrb $1, %xmm0, %eax
+; SSE41-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: extractelement_v16i8_1:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrb $1, %xmm0, %eax
+; AVX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX-NEXT: retq
+ %b = extractelement <16 x i8> %a, i256 1
+ ret i8 %b
+}
+
+define i8 @extractelement_v16i8_11(<16 x i8> %a) nounwind {
+; SSE2-LABEL: extractelement_v16i8_11:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v16i8_11:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrb $11, %xmm0, %eax
+; SSE41-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: extractelement_v16i8_11:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrb $11, %xmm0, %eax
+; AVX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX-NEXT: retq
+ %b = extractelement <16 x i8> %a, i256 11
+ ret i8 %b
+}
+
+define i8 @extractelement_v16i8_14(<16 x i8> %a) nounwind {
+; SSE2-LABEL: extractelement_v16i8_14:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v16i8_14:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrb $14, %xmm0, %eax
+; SSE41-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: extractelement_v16i8_14:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrb $14, %xmm0, %eax
+; AVX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX-NEXT: retq
+ %b = extractelement <16 x i8> %a, i256 14
+ ret i8 %b
+}
+
+define i8 @extractelement_v32i8_1(<32 x i8> %a) nounwind {
+; SSE2-LABEL: extractelement_v32i8_1:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v32i8_1:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrb $1, %xmm0, %eax
+; SSE41-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: extractelement_v32i8_1:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrb $1, %xmm0, %eax
+; AVX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%b = extractelement <32 x i8> %a, i256 1
ret i8 %b
}
-; CHECK-LABEL: extractelement_index_2:
-define i32 @extractelement_index_2(<8 x i32> %a) nounwind {
- ; X64: pshufd
- ; AVX: vextractf128 $1
- ; AVX-NEXT: vpextrd $3
- %b = extractelement <8 x i32> %a, i64 7
+define i8 @extractelement_v32i8_17(<32 x i8> %a) nounwind {
+; SSE2-LABEL: extractelement_v32i8_17:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v32i8_17:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrb $1, %xmm1, %eax
+; SSE41-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: extractelement_v32i8_17:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: extractelement_v32i8_17:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %b = extractelement <32 x i8> %a, i256 17
+ ret i8 %b
+}
+
+define i16 @extractelement_v8i16_0(<8 x i16> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v8i16_0:
+; SSE: # BB#0:
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v8i16_0:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX-NEXT: retq
+ %b = extractelement <8 x i16> %a, i256 0
+ ret i16 %b
+}
+
+define i16 @extractelement_v8i16_3(<8 x i16> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v8i16_3:
+; SSE: # BB#0:
+; SSE-NEXT: pextrw $3, %xmm0, %eax
+; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v8i16_3:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrw $3, %xmm0, %eax
+; AVX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX-NEXT: retq
+ %b = extractelement <8 x i16> %a, i256 3
+ ret i16 %b
+}
+
+define i16 @extractelement_v16i16_0(<16 x i16> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v16i16_0:
+; SSE: # BB#0:
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v16i16_0:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %b = extractelement <16 x i16> %a, i256 0
+ ret i16 %b
+}
+
+define i16 @extractelement_v16i16_13(<16 x i16> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v16i16_13:
+; SSE: # BB#0:
+; SSE-NEXT: pextrw $5, %xmm1, %eax
+; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: extractelement_v16i16_13:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpextrw $5, %xmm0, %eax
+; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: extractelement_v16i16_13:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrw $5, %xmm0, %eax
+; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %b = extractelement <16 x i16> %a, i256 13
+ ret i16 %b
+}
+
+define i32 @extractelement_v4i32_0(<4 x i32> %a) nounwind {
+; SSE-LABEL: extractelement_v4i32_0:
+; SSE: # BB#0:
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v4i32_0:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
+ %b = extractelement <4 x i32> %a, i256 0
ret i32 %b
}
-; CHECK-LABEL: extractelement_index_3:
-define i32 @extractelement_index_3(<8 x i32> %a) nounwind {
- ; CHECK-NOT: pextr
- %b = extractelement <8 x i32> %a, i64 15
+define i32 @extractelement_v4i32_3(<4 x i32> %a) nounwind {
+; SSE2-LABEL: extractelement_v4i32_3:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v4i32_3:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrd $3, %xmm0, %eax
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: extractelement_v4i32_3:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrd $3, %xmm0, %eax
+; AVX-NEXT: retq
+ %b = extractelement <4 x i32> %a, i256 3
+ ret i32 %b
+}
+
+define i32 @extractelement_v8i32_0(<8 x i32> %a) nounwind {
+; SSE-LABEL: extractelement_v8i32_0:
+; SSE: # BB#0:
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v8i32_0:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %b = extractelement <8 x i32> %a, i256 4
ret i32 %b
}
-; CHECK-LABEL: extractelement_index_4:
-define i32 @extractelement_index_4(<8 x i32> %a) nounwind {
- ; X64: movd
- ; AVX: vextractf128 $1
- ; AVX-NEXT: vmovd
+define i32 @extractelement_v8i32_4(<8 x i32> %a) nounwind {
+; SSE-LABEL: extractelement_v8i32_4:
+; SSE: # BB#0:
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v8i32_4:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%b = extractelement <8 x i32> %a, i256 4
ret i32 %b
}
-; CHECK-LABEL: extractelement_index_5:
-define i8 @extractelement_index_5(<32 x i8> %a, i256 %i) nounwind {
- ; X64: movaps
- ; AVX: vmovaps
+define i32 @extractelement_v8i32_7(<8 x i32> %a) nounwind {
+; SSE2-LABEL: extractelement_v8i32_7:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v8i32_7:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrd $3, %xmm1, %eax
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: extractelement_v8i32_7:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpextrd $3, %xmm0, %eax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: extractelement_v8i32_7:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrd $3, %xmm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %b = extractelement <8 x i32> %a, i64 7
+ ret i32 %b
+}
+
+define i64 @extractelement_v2i64_0(<2 x i64> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v2i64_0:
+; SSE: # BB#0:
+; SSE-NEXT: movd %xmm0, %rax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v2i64_0:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: retq
+ %b = extractelement <2 x i64> %a, i256 0
+ ret i64 %b
+}
+
+define i64 @extractelement_v2i64_1(<2 x i64> %a, i256 %i) nounwind {
+; SSE2-LABEL: extractelement_v2i64_1:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v2i64_1:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: extractelement_v2i64_1:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: retq
+ %b = extractelement <2 x i64> %a, i256 1
+ ret i64 %b
+}
+
+define i64 @extractelement_v4i64_1(<4 x i64> %a, i256 %i) nounwind {
+; SSE2-LABEL: extractelement_v4i64_1:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v4i64_1:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: extractelement_v4i64_1:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %b = extractelement <4 x i64> %a, i256 1
+ ret i64 %b
+}
+
+define i64 @extractelement_v4i64_3(<4 x i64> %a, i256 %i) nounwind {
+; SSE2-LABEL: extractelement_v4i64_3:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: extractelement_v4i64_3:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrq $1, %xmm1, %rax
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: extractelement_v4i64_3:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: extractelement_v4i64_3:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %b = extractelement <4 x i64> %a, i256 3
+ ret i64 %b
+}
+
+;
+; ExtractElement - Variable Index
+;
+
+define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v16i8_var:
+; SSE: # BB#0:
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movb (%rdi,%rax), %al
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v16i8_var:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movb (%rdi,%rax), %al
+; AVX-NEXT: retq
+ %b = extractelement <16 x i8> %a, i256 %i
+ ret i8 %b
+}
+
+define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v32i8_var:
+; SSE: # BB#0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $64, %rsp
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
+; SSE-NEXT: leaq (%rsp), %rax
+; SSE-NEXT: movb (%rdi,%rax), %al
+; SSE-NEXT: movq %rbp, %rsp
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v32i8_var:
+; AVX: # BB#0:
+; AVX-NEXT: pushq %rbp
+; AVX-NEXT: movq %rsp, %rbp
+; AVX-NEXT: andq $-32, %rsp
+; AVX-NEXT: subq $64, %rsp
+; AVX-NEXT: vmovaps %ymm0, (%rsp)
+; AVX-NEXT: leaq (%rsp), %rax
+; AVX-NEXT: movb (%rdi,%rax), %al
+; AVX-NEXT: movq %rbp, %rsp
+; AVX-NEXT: popq %rbp
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%b = extractelement <32 x i8> %a, i256 %i
ret i8 %b
}
-; CHECK-LABEL: extractelement_index_6:
-define i8 @extractelement_index_6(<32 x i8> %a) nounwind {
- ; CHECK-NOT: pextr
+define i16 @extractelement_v8i16_var(<8 x i16> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v8i16_var:
+; SSE: # BB#0:
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movzwl -24(%rsp,%rdi,2), %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v8i16_var:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movzwl -24(%rsp,%rdi,2), %eax
+; AVX-NEXT: retq
+ %b = extractelement <8 x i16> %a, i256 %i
+ ret i16 %b
+}
+
+define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v16i16_var:
+; SSE: # BB#0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $64, %rsp
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
+; SSE-NEXT: movzwl (%rsp,%rdi,2), %eax
+; SSE-NEXT: movq %rbp, %rsp
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v16i16_var:
+; AVX: # BB#0:
+; AVX-NEXT: pushq %rbp
+; AVX-NEXT: movq %rsp, %rbp
+; AVX-NEXT: andq $-32, %rsp
+; AVX-NEXT: subq $64, %rsp
+; AVX-NEXT: vmovaps %ymm0, (%rsp)
+; AVX-NEXT: movzwl (%rsp,%rdi,2), %eax
+; AVX-NEXT: movq %rbp, %rsp
+; AVX-NEXT: popq %rbp
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %b = extractelement <16 x i16> %a, i256 %i
+ ret i16 %b
+}
+
+define i32 @extractelement_v4i32_var(<4 x i32> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v4i32_var:
+; SSE: # BB#0:
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl -24(%rsp,%rdi,4), %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v4i32_var:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movl -24(%rsp,%rdi,4), %eax
+; AVX-NEXT: retq
+ %b = extractelement <4 x i32> %a, i256 %i
+ ret i32 %b
+}
+
+define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v8i32_var:
+; SSE: # BB#0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $64, %rsp
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
+; SSE-NEXT: movl (%rsp,%rdi,4), %eax
+; SSE-NEXT: movq %rbp, %rsp
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: extractelement_v8i32_var:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; AVX1-NEXT: movl (%rsp,%rdi,4), %eax
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: extractelement_v8i32_var:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %b = extractelement <8 x i32> %a, i256 %i
+ ret i32 %b
+}
+
+define i64 @extractelement_v2i64_var(<2 x i64> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v2i64_var:
+; SSE: # BB#0:
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -24(%rsp,%rdi,8), %rax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v2i64_var:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -24(%rsp,%rdi,8), %rax
+; AVX-NEXT: retq
+ %b = extractelement <2 x i64> %a, i256 %i
+ ret i64 %b
+}
+
+define i64 @extractelement_v4i64_var(<4 x i64> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v4i64_var:
+; SSE: # BB#0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: movq %rsp, %rbp
+; SSE-NEXT: andq $-32, %rsp
+; SSE-NEXT: subq $64, %rsp
+; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
+; SSE-NEXT: movq (%rsp,%rdi,8), %rax
+; SSE-NEXT: movq %rbp, %rsp
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v4i64_var:
+; AVX: # BB#0:
+; AVX-NEXT: pushq %rbp
+; AVX-NEXT: movq %rsp, %rbp
+; AVX-NEXT: andq $-32, %rsp
+; AVX-NEXT: subq $64, %rsp
+; AVX-NEXT: vmovaps %ymm0, (%rsp)
+; AVX-NEXT: movq (%rsp,%rdi,8), %rax
+; AVX-NEXT: movq %rbp, %rsp
+; AVX-NEXT: popq %rbp
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %b = extractelement <4 x i64> %a, i256 %i
+ ret i64 %b
+}
+
+;
+; ExtractElement - Constant (Out Of Range) Index
+;
+
+define i8 @extractelement_32i8_m1(<32 x i8> %a) nounwind {
+; SSE-LABEL: extractelement_32i8_m1:
+; SSE: # BB#0:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_32i8_m1:
+; AVX: # BB#0:
+; AVX-NEXT: retq
%b = extractelement <32 x i8> %a, i256 -1
ret i8 %b
-}
\ No newline at end of file
+}
+
+define i16 @extractelement_v16i16_m4(<16 x i16> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v16i16_m4:
+; SSE: # BB#0:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v16i16_m4:
+; AVX: # BB#0:
+; AVX-NEXT: retq
+ %b = extractelement <16 x i16> %a, i256 -4
+ ret i16 %b
+}
+
+define i32 @extractelement_v8i32_15(<8 x i32> %a) nounwind {
+; SSE-LABEL: extractelement_v8i32_15:
+; SSE: # BB#0:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v8i32_15:
+; AVX: # BB#0:
+; AVX-NEXT: retq
+ %b = extractelement <8 x i32> %a, i64 15
+ ret i32 %b
+}
+
+define i64 @extractelement_v4i64_4(<4 x i64> %a, i256 %i) nounwind {
+; SSE-LABEL: extractelement_v4i64_4:
+; SSE: # BB#0:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: extractelement_v4i64_4:
+; AVX: # BB#0:
+; AVX-NEXT: retq
+ %b = extractelement <4 x i64> %a, i256 4
+ ret i64 %b
+}
diff --git a/test/CodeGen/X86/extractelement-load.ll b/test/CodeGen/X86/extractelement-load.ll
index e50d353797a0..5855303e1278 100644
--- a/test/CodeGen/X86/extractelement-load.ll
+++ b/test/CodeGen/X86/extractelement-load.ll
@@ -1,28 +1,48 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=yonah | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mattr=+sse2 -mcpu=core2 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mattr=+avx -mcpu=btver2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=X64-SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64-AVX
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
define i32 @t(<2 x i64>* %val) nounwind {
-; CHECK-LABEL: t:
-; CHECK-NOT: movd
-; CHECK: movl 8(
-; CHECK-NEXT: ret
- %tmp2 = load <2 x i64>, <2 x i64>* %val, align 16 ; <<2 x i64>> [#uses=1]
- %tmp3 = bitcast <2 x i64> %tmp2 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp4 = extractelement <4 x i32> %tmp3, i32 2 ; <i32> [#uses=1]
- ret i32 %tmp4
+; X32-SSE2-LABEL: t:
+; X32-SSE2: # BB#0:
+; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT: movl 8(%eax), %eax
+; X32-SSE2-NEXT: retl
+;
+; X64-SSSE3-LABEL: t:
+; X64-SSSE3: # BB#0:
+; X64-SSSE3-NEXT: movl 8(%rdi), %eax
+; X64-SSSE3-NEXT: retq
+;
+; X64-AVX-LABEL: t:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: movl 8(%rdi), %eax
+; X64-AVX-NEXT: retq
+ %tmp2 = load <2 x i64>, <2 x i64>* %val, align 16 ; <<2 x i64>> [#uses=1]
+ %tmp3 = bitcast <2 x i64> %tmp2 to <4 x i32> ; <<4 x i32>> [#uses=1]
+ %tmp4 = extractelement <4 x i32> %tmp3, i32 2 ; <i32> [#uses=1]
+ ret i32 %tmp4
}
; Case where extractelement of load ends up as undef.
; (Making sure this doesn't crash.)
define i32 @t2(<8 x i32>* %xp) {
-; CHECK-LABEL: t2:
-; CHECK: ret
+; X32-SSE2-LABEL: t2:
+; X32-SSE2: # BB#0:
+; X32-SSE2-NEXT: retl
+;
+; X64-SSSE3-LABEL: t2:
+; X64-SSSE3: # BB#0:
+; X64-SSSE3-NEXT: retq
+;
+; X64-AVX-LABEL: t2:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: retq
%x = load <8 x i32>, <8 x i32>* %xp
- %Shuff68 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32
-undef, i32 7, i32 9, i32 undef, i32 13, i32 15, i32 1, i32 3>
+ %Shuff68 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 undef, i32 7, i32 9, i32 undef, i32 13, i32 15, i32 1, i32 3>
%y = extractelement <8 x i32> %Shuff68, i32 0
ret i32 %y
}
@@ -36,10 +56,20 @@ undef, i32 7, i32 9, i32 undef, i32 13, i32 15, i32 1, i32 3>
; need to special-case the checks.
define void @t3() {
-; CHECK-LABEL: t3:
-; CHECK: movupd
-; CHECK: movhpd
-
+; X32-SSE2-LABEL: t3:
+; X32-SSE2: # BB#0: # %bb
+; X32-SSE2-NEXT: movupd (%eax), %xmm0
+; X32-SSE2-NEXT: movhpd %xmm0, (%eax)
+;
+; X64-SSSE3-LABEL: t3:
+; X64-SSSE3: # BB#0: # %bb
+; X64-SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
+; X64-SSSE3-NEXT: movlpd %xmm0, (%rax)
+;
+; X64-AVX-LABEL: t3:
+; X64-AVX: # BB#0: # %bb
+; X64-AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X64-AVX-NEXT: vmovlpd %xmm0, (%rax)
bb:
%tmp13 = load <2 x double>, <2 x double>* undef, align 1
%.sroa.3.24.vec.extract = extractelement <2 x double> %tmp13, i32 1
@@ -52,9 +82,26 @@ bb:
; This is testing for an assertion - the extraction was assuming that the undef
; second shuffle operand was a post-bitcast type instead of a pre-bitcast type.
define i64 @t4(<2 x double>* %a) {
-; CHECK-LABEL: t4:
-; CHECK: mov
-; CHECK: ret
+; X32-SSE2-LABEL: t4:
+; X32-SSE2: # BB#0:
+; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT: movapd (%eax), %xmm0
+; X32-SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT: movd %xmm1, %eax
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X32-SSE2-NEXT: movd %xmm0, %edx
+; X32-SSE2-NEXT: retl
+;
+; X64-SSSE3-LABEL: t4:
+; X64-SSSE3: # BB#0:
+; X64-SSSE3-NEXT: movq (%rdi), %rax
+; X64-SSSE3-NEXT: retq
+;
+; X64-AVX-LABEL: t4:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: movq (%rdi), %rax
+; X64-AVX-NEXT: retq
%b = load <2 x double>, <2 x double>* %a, align 16
%c = shufflevector <2 x double> %b, <2 x double> %b, <2 x i32> <i32 1, i32 0>
%d = bitcast <2 x double> %c to <2 x i64>
diff --git a/test/CodeGen/X86/extractps.ll b/test/CodeGen/X86/extractps.ll
index fecd2faed321..7d4c2cf619a1 100644
--- a/test/CodeGen/X86/extractps.ll
+++ b/test/CodeGen/X86/extractps.ll
@@ -4,7 +4,7 @@
; RUN: grep "extractps \$1, %xmm0, " %t | count 1
; PR2647
-external global float, align 16 ; <float*>:0 [#uses=2]
+@0 = external global float, align 16 ; <float*>:0 [#uses=2]
define internal void @""() nounwind {
load float, float* @0, align 16 ; <float>:1 [#uses=1]
diff --git a/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll b/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..6b7d39548385
--- /dev/null
+++ b/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/f16c-builtins.c
+
+define float @test_cvtsh_ss(i16 %a0) nounwind {
+; X32-LABEL: test_cvtsh_ss:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovd %eax, %xmm0
+; X32-NEXT: vcvtph2ps %xmm0, %xmm0
+; X32-NEXT: vmovss %xmm0, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_cvtsh_ss:
+; X64: # BB#0:
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: vmovd %eax, %xmm0
+; X64-NEXT: vcvtph2ps %xmm0, %xmm0
+; X64-NEXT: retq
+ %ins0 = insertelement <8 x i16> undef, i16 %a0, i32 0
+ %ins1 = insertelement <8 x i16> %ins0, i16 0, i32 1
+ %ins2 = insertelement <8 x i16> %ins1, i16 0, i32 2
+ %ins3 = insertelement <8 x i16> %ins2, i16 0, i32 3
+ %ins4 = insertelement <8 x i16> %ins3, i16 0, i32 4
+ %ins5 = insertelement <8 x i16> %ins4, i16 0, i32 5
+ %ins6 = insertelement <8 x i16> %ins5, i16 0, i32 6
+ %ins7 = insertelement <8 x i16> %ins6, i16 0, i32 7
+ %cvt = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %ins7)
+ %res = extractelement <4 x float> %cvt, i32 0
+ ret float %res
+}
+
+define i16 @test_cvtss_sh(float %a0) nounwind {
+; X32-LABEL: test_cvtss_sh:
+; X32: # BB#0:
+; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0
+; X32-NEXT: vmovd %xmm0, %eax
+; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: retl
+;
+; X64-LABEL: test_cvtss_sh:
+; X64: # BB#0:
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0
+; X64-NEXT: vmovd %xmm0, %eax
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: retq
+ %ins0 = insertelement <4 x float> undef, float %a0, i32 0
+ %ins1 = insertelement <4 x float> %ins0, float 0.000000e+00, i32 1
+ %ins2 = insertelement <4 x float> %ins1, float 0.000000e+00, i32 2
+ %ins3 = insertelement <4 x float> %ins2, float 0.000000e+00, i32 3
+ %cvt = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %ins3, i32 0)
+ %res = extractelement <8 x i16> %cvt, i32 0
+ ret i16 %res
+}
+
+define <4 x float> @test_mm_cvtph_ps(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_cvtph_ps:
+; X32: # BB#0:
+; X32-NEXT: vcvtph2ps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtph_ps:
+; X64: # BB#0:
+; X64-NEXT: vcvtph2ps %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %arg0)
+ ret <4 x float> %res
+}
+
+define <8 x float> @test_mm256_cvtph_ps(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_cvtph_ps:
+; X32: # BB#0:
+; X32-NEXT: vcvtph2ps %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtph_ps:
+; X64: # BB#0:
+; X64-NEXT: vcvtph2ps %xmm0, %ymm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %arg0)
+ ret <8 x float> %res
+}
+
+define <2 x i64> @test_mm_cvtps_ph(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvtps_ph:
+; X32: # BB#0:
+; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtps_ph:
+; X64: # BB#0:
+; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0
+; X64-NEXT: retq
+ %cvt = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0)
+ %res = bitcast <8 x i16> %cvt to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm256_cvtps_ph(<8 x float> %a0) nounwind {
+; X32-LABEL: test_mm256_cvtps_ph:
+; X32: # BB#0:
+; X32-NEXT: vcvtps2ph $0, %ymm0, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtps_ph:
+; X64: # BB#0:
+; X64-NEXT: vcvtps2ph $0, %ymm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %cvt = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0)
+ %res = bitcast <8 x i16> %cvt to <2 x i64>
+ ret <2 x i64> %res
+}
+
+declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly
+
+declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) nounwind readonly
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly
diff --git a/test/CodeGen/X86/fast-isel-call.ll b/test/CodeGen/X86/fast-isel-call.ll
index 9fd07b521ab2..ee70404bcedf 100644
--- a/test/CodeGen/X86/fast-isel-call.ll
+++ b/test/CodeGen/X86/fast-isel-call.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -O0 -fast-isel-abort=1 -march=x86 | FileCheck %s
+; RUN: llc < %s -O0 -fast-isel-abort=1 -march=x86 -mtriple=i686-apple-darwin8 2>/dev/null | FileCheck %s
+; RUN: llc < %s -O0 -fast-isel-abort=1 -march=x86 -mtriple=i686-apple-darwin8 2>&1 >/dev/null | FileCheck -check-prefix=STDERR -allow-empty %s
%struct.s = type {i32, i32, i32}
@@ -22,12 +23,12 @@ define void @test2(%struct.s* %d) nounwind {
call void @foo2(%struct.s* byval %d )
ret void
; CHECK-LABEL: test2:
-; CHECK: movl (%eax)
-; CHECK: movl {{.*}}, (%esp)
-; CHECK: movl 4(%eax)
-; CHECK: movl {{.*}}, 4(%esp)
-; CHECK: movl 8(%eax)
-; CHECK: movl {{.*}}, 8(%esp)
+; CHECK: movl (%eax), %ecx
+; CHECK: movl %ecx, (%esp)
+; CHECK: movl 4(%eax), %ecx
+; CHECK: movl %ecx, 4(%esp)
+; CHECK: movl 8(%eax), %eax
+; CHECK: movl %eax, 8(%esp)
}
declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
@@ -53,3 +54,32 @@ define void @test4(i8* %a, i8* %b) {
; CHECK: movl $100, 8(%esp)
; CHECK: calll {{.*}}memcpy
}
+
+; STDERR-NOT: FastISel missed call: call x86_thiscallcc void @thiscallfun
+%struct.S = type { i8 }
+define void @test5() {
+entry:
+ %s = alloca %struct.S, align 1
+; CHECK-LABEL: test5:
+; CHECK: subl $12, %esp
+; CHECK: leal 8(%esp), %ecx
+; CHECK: movl $43, (%esp)
+; CHECK: calll {{.*}}thiscallfun
+; CHECK: addl $8, %esp
+ call x86_thiscallcc void @thiscallfun(%struct.S* %s, i32 43)
+ ret void
+}
+declare x86_thiscallcc void @thiscallfun(%struct.S*, i32) #1
+
+; STDERR-NOT: FastISel missed call: call x86_stdcallcc void @stdcallfun
+define void @test6() {
+entry:
+; CHECK-LABEL: test6:
+; CHECK: subl $12, %esp
+; CHECK: movl $43, (%esp)
+; CHECK: calll {{.*}}stdcallfun
+; CHECK: addl $8, %esp
+ call x86_stdcallcc void @stdcallfun(i32 43)
+ ret void
+}
+declare x86_stdcallcc void @stdcallfun(i32) #1
diff --git a/test/CodeGen/X86/fast-isel-cmp-branch2.ll b/test/CodeGen/X86/fast-isel-cmp-branch2.ll
index 04dbac07690a..475d8fcf7f35 100644
--- a/test/CodeGen/X86/fast-isel-cmp-branch2.ll
+++ b/test/CodeGen/X86/fast-isel-cmp-branch2.ll
@@ -5,7 +5,7 @@ define i32 @fcmp_oeq(float %x, float %y) {
; CHECK-LABEL: fcmp_oeq
; CHECK: ucomiss %xmm1, %xmm0
; CHECK-NEXT: jne {{LBB.+_1}}
-; CHECK-NEXT: jnp {{LBB.+_2}}
+; CHECK-NEXT: jp {{LBB.+_1}}
%1 = fcmp oeq float %x, %y
br i1 %1, label %bb1, label %bb2
bb2:
@@ -162,8 +162,7 @@ define i32 @fcmp_une(float %x, float %y) {
; CHECK-LABEL: fcmp_une
; CHECK: ucomiss %xmm1, %xmm0
; CHECK-NEXT: jne {{LBB.+_2}}
-; CHECK-NEXT: jp {{LBB.+_2}}
-; CHECK-NEXT: jmp {{LBB.+_1}}
+; CHECK-NEXT: jnp {{LBB.+_1}}
%1 = fcmp une float %x, %y
br i1 %1, label %bb1, label %bb2
bb2:
diff --git a/test/CodeGen/X86/fast-isel-cmp-branch3.ll b/test/CodeGen/X86/fast-isel-cmp-branch3.ll
index e54d0ca40078..8f09b2e38356 100644
--- a/test/CodeGen/X86/fast-isel-cmp-branch3.ll
+++ b/test/CodeGen/X86/fast-isel-cmp-branch3.ll
@@ -17,7 +17,7 @@ define i32 @fcmp_oeq2(float %x) {
; CHECK: xorps %xmm1, %xmm1
; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: jne {{LBB.+_1}}
-; CHECK-NEXT: jnp {{LBB.+_2}}
+; CHECK-NEXT: jp {{LBB.+_1}}
%1 = fcmp oeq float %x, 0.000000e+00
br i1 %1, label %bb1, label %bb2
bb2:
@@ -338,8 +338,7 @@ define i32 @fcmp_une2(float %x) {
; CHECK: xorps %xmm1, %xmm1
; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: jne {{LBB.+_2}}
-; CHECK-NEXT: jp {{LBB.+_2}}
-; CHECK-NEXT: jmp {{LBB.+_1}}
+; CHECK-NEXT: jnp {{LBB.+_1}}
%1 = fcmp une float %x, 0.000000e+00
br i1 %1, label %bb1, label %bb2
bb2:
diff --git a/test/CodeGen/X86/fast-isel-float-half-convertion.ll b/test/CodeGen/X86/fast-isel-float-half-convertion.ll
index 707a325bf41d..acb85fd171f5 100644
--- a/test/CodeGen/X86/fast-isel-float-half-convertion.ll
+++ b/test/CodeGen/X86/fast-isel-float-half-convertion.ll
@@ -4,7 +4,7 @@
define i16 @test_fp32_to_fp16(float %a) {
; CHECK-LABEL: test_fp32_to_fp16:
-; CHECK: vcvtps2ph $0, %xmm0, %xmm0
+; CHECK: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/fast-isel-nontemporal.ll b/test/CodeGen/X86/fast-isel-nontemporal.ll
index 6a174dbf5a8a..2fc08fb4135d 100644
--- a/test/CodeGen/X86/fast-isel-nontemporal.ll
+++ b/test/CodeGen/X86/fast-isel-nontemporal.ll
@@ -1,7 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4a -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE4A
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+sse2 -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+sse4a -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE4A
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+avx -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+avx2 -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+avx512f -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+
+;
+; Scalar Stores
+;
define void @test_nti32(i32* nocapture %ptr, i32 %X) {
; ALL-LABEL: test_nti32:
@@ -34,10 +42,20 @@ define void @test_ntfloat(float* nocapture %ptr, float %X) {
; SSE4A-NEXT: movntss %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
+; SSE41-LABEL: test_ntfloat:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movss %xmm0, (%rdi)
+; SSE41-NEXT: retq
+;
; AVX-LABEL: test_ntfloat:
; AVX: # BB#0: # %entry
; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_ntfloat:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovss %xmm0, (%rdi)
+; AVX512-NEXT: retq
entry:
store float %X, float* %ptr, align 4, !nontemporal !1
ret void
@@ -54,15 +72,29 @@ define void @test_ntdouble(double* nocapture %ptr, double %X) {
; SSE4A-NEXT: movntsd %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
+; SSE41-LABEL: test_ntdouble:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movsd %xmm0, (%rdi)
+; SSE41-NEXT: retq
+;
; AVX-LABEL: test_ntdouble:
; AVX: # BB#0: # %entry
; AVX-NEXT: vmovsd %xmm0, (%rdi)
; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_ntdouble:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovsd %xmm0, (%rdi)
+; AVX512-NEXT: retq
entry:
store double %X, double* %ptr, align 8, !nontemporal !1
ret void
}
+;
+; 128-bit Vector Stores
+;
+
define void @test_nt4xfloat(<4 x float>* nocapture %ptr, <4 x float> %X) {
; SSE-LABEL: test_nt4xfloat:
; SSE: # BB#0: # %entry
@@ -73,6 +105,11 @@ define void @test_nt4xfloat(<4 x float>* nocapture %ptr, <4 x float> %X) {
; AVX: # BB#0: # %entry
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt4xfloat:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
+; AVX512-NEXT: retq
entry:
store <4 x float> %X, <4 x float>* %ptr, align 16, !nontemporal !1
ret void
@@ -88,11 +125,76 @@ define void @test_nt2xdouble(<2 x double>* nocapture %ptr, <2 x double> %X) {
; AVX: # BB#0: # %entry
; AVX-NEXT: vmovntpd %xmm0, (%rdi)
; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt2xdouble:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntpd %xmm0, (%rdi)
+; AVX512-NEXT: retq
entry:
store <2 x double> %X, <2 x double>* %ptr, align 16, !nontemporal !1
ret void
}
+define void @test_nt16xi8(<16 x i8>* nocapture %ptr, <16 x i8> %X) {
+; SSE-LABEL: test_nt16xi8:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt16xi8:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt16xi8:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <16 x i8> %X, <16 x i8>* %ptr, align 16, !nontemporal !1
+ ret void
+}
+
+define void @test_nt8xi16(<8 x i16>* nocapture %ptr, <8 x i16> %X) {
+; SSE-LABEL: test_nt8xi16:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt8xi16:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt8xi16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <8 x i16> %X, <8 x i16>* %ptr, align 16, !nontemporal !1
+ ret void
+}
+
+define void @test_nt4xi32(<4 x i32>* nocapture %ptr, <4 x i32> %X) {
+; SSE-LABEL: test_nt4xi32:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt4xi32:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt4xi32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <4 x i32> %X, <4 x i32>* %ptr, align 16, !nontemporal !1
+ ret void
+}
+
define void @test_nt2xi64(<2 x i64>* nocapture %ptr, <2 x i64> %X) {
; SSE-LABEL: test_nt2xi64:
; SSE: # BB#0: # %entry
@@ -103,9 +205,984 @@ define void @test_nt2xi64(<2 x i64>* nocapture %ptr, <2 x i64> %X) {
; AVX: # BB#0: # %entry
; AVX-NEXT: vmovntdq %xmm0, (%rdi)
; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt2xi64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX512-NEXT: retq
entry:
store <2 x i64> %X, <2 x i64>* %ptr, align 16, !nontemporal !1
ret void
}
+;
+; 128-bit Vector Loads
+;
+
+define <4 x float> @test_load_nt4xfloat(<4 x float>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt4xfloat:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt4xfloat:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt4xfloat:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_load_nt4xfloat:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt4xfloat:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <4 x float>, <4 x float>* %ptr, align 16, !nontemporal !1
+ ret <4 x float> %0
+}
+
+define <2 x double> @test_load_nt2xdouble(<2 x double>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt2xdouble:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movapd (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt2xdouble:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movapd (%rdi), %xmm0
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt2xdouble:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_load_nt2xdouble:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt2xdouble:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <2 x double>, <2 x double>* %ptr, align 16, !nontemporal !1
+ ret <2 x double> %0
+}
+
+define <16 x i8> @test_load_nt16xi8(<16 x i8>* nocapture %ptr) {
+; SSE-LABEL: test_load_nt16xi8:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdqa (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_load_nt16xi8:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt16xi8:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <16 x i8>, <16 x i8>* %ptr, align 16, !nontemporal !1
+ ret <16 x i8> %0
+}
+
+define <8 x i16> @test_load_nt8xi16(<8 x i16>* nocapture %ptr) {
+; SSE-LABEL: test_load_nt8xi16:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdqa (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_load_nt8xi16:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt8xi16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <8 x i16>, <8 x i16>* %ptr, align 16, !nontemporal !1
+ ret <8 x i16> %0
+}
+
+define <4 x i32> @test_load_nt4xi32(<4 x i32>* nocapture %ptr) {
+; SSE-LABEL: test_load_nt4xi32:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdqa (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_load_nt4xi32:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt4xi32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <4 x i32>, <4 x i32>* %ptr, align 16, !nontemporal !1
+ ret <4 x i32> %0
+}
+
+define <2 x i64> @test_load_nt2xi64(<2 x i64>* nocapture %ptr) {
+; SSE-LABEL: test_load_nt2xi64:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdqa (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_load_nt2xi64:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt2xi64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <2 x i64>, <2 x i64>* %ptr, align 16, !nontemporal !1
+ ret <2 x i64> %0
+}
+
+;
+; 256-bit Vector Stores
+;
+
+define void @test_nt8xfloat(<8 x float>* nocapture %ptr, <8 x float> %X) {
+; SSE-LABEL: test_nt8xfloat:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt8xfloat:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt8xfloat:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntps %ymm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <8 x float> %X, <8 x float>* %ptr, align 32, !nontemporal !1
+ ret void
+}
+
+define void @test_nt4xdouble(<4 x double>* nocapture %ptr, <4 x double> %X) {
+; SSE-LABEL: test_nt4xdouble:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntpd %xmm0, (%rdi)
+; SSE-NEXT: movntpd %xmm1, 16(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt4xdouble:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntpd %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt4xdouble:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntpd %ymm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <4 x double> %X, <4 x double>* %ptr, align 32, !nontemporal !1
+ ret void
+}
+
+define void @test_nt32xi8(<32 x i8>* nocapture %ptr, <32 x i8> %X) {
+; SSE-LABEL: test_nt32xi8:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt32xi8:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt32xi8:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <32 x i8> %X, <32 x i8>* %ptr, align 32, !nontemporal !1
+ ret void
+}
+
+define void @test_nt16xi16(<16 x i16>* nocapture %ptr, <16 x i16> %X) {
+; SSE-LABEL: test_nt16xi16:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt16xi16:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt16xi16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <16 x i16> %X, <16 x i16>* %ptr, align 32, !nontemporal !1
+ ret void
+}
+
+define void @test_nt8xi32(<8 x i32>* nocapture %ptr, <8 x i32> %X) {
+; SSE-LABEL: test_nt8xi32:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt8xi32:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt8xi32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <8 x i32> %X, <8 x i32>* %ptr, align 32, !nontemporal !1
+ ret void
+}
+
+define void @test_nt4xi64(<4 x i64>* nocapture %ptr, <4 x i64> %X) {
+; SSE-LABEL: test_nt4xi64:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt4xi64:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt4xi64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <4 x i64> %X, <4 x i64>* %ptr, align 32, !nontemporal !1
+ ret void
+}
+
+;
+; 256-bit Vector Loads
+;
+
+define <8 x float> @test_load_nt8xfloat(<8 x float>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt8xfloat:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt8xfloat:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt8xfloat:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt8xfloat:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt8xfloat:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt8xfloat:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <8 x float>, <8 x float>* %ptr, align 32, !nontemporal !1
+ ret <8 x float> %0
+}
+
+define <4 x double> @test_load_nt4xdouble(<4 x double>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt4xdouble:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movapd (%rdi), %xmm0
+; SSE2-NEXT: movapd 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt4xdouble:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movapd (%rdi), %xmm0
+; SSE4A-NEXT: movapd 16(%rdi), %xmm1
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt4xdouble:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt4xdouble:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovapd (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt4xdouble:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt4xdouble:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <4 x double>, <4 x double>* %ptr, align 32, !nontemporal !1
+ ret <4 x double> %0
+}
+
+define <32 x i8> @test_load_nt32xi8(<32 x i8>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt32xi8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt32xi8:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt32xi8:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt32xi8:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt32xi8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt32xi8:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <32 x i8>, <32 x i8>* %ptr, align 32, !nontemporal !1
+ ret <32 x i8> %0
+}
+
+define <16 x i16> @test_load_nt16xi16(<16 x i16>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt16xi16:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt16xi16:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt16xi16:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt16xi16:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt16xi16:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt16xi16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <16 x i16>, <16 x i16>* %ptr, align 32, !nontemporal !1
+ ret <16 x i16> %0
+}
+
+define <8 x i32> @test_load_nt8xi32(<8 x i32>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt8xi32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt8xi32:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt8xi32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt8xi32:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt8xi32:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt8xi32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <8 x i32>, <8 x i32>* %ptr, align 32, !nontemporal !1
+ ret <8 x i32> %0
+}
+
+define <4 x i64> @test_load_nt4xi64(<4 x i64>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt4xi64:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt4xi64:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt4xi64:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt4xi64:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt4xi64:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt4xi64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <4 x i64>, <4 x i64>* %ptr, align 32, !nontemporal !1
+ ret <4 x i64> %0
+}
+
+;
+; 512-bit Vector Stores
+;
+
+define void @test_nt16xfloat(<16 x float>* nocapture %ptr, <16 x float> %X) {
+; SSE-LABEL: test_nt16xfloat:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm2, 32(%rdi)
+; SSE-NEXT: movntps %xmm3, 48(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt16xfloat:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vmovntps %ymm1, 32(%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt16xfloat:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntps %zmm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <16 x float> %X, <16 x float>* %ptr, align 64, !nontemporal !1
+ ret void
+}
+
+define void @test_nt8xdouble(<8 x double>* nocapture %ptr, <8 x double> %X) {
+; SSE-LABEL: test_nt8xdouble:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntpd %xmm0, (%rdi)
+; SSE-NEXT: movntpd %xmm1, 16(%rdi)
+; SSE-NEXT: movntpd %xmm2, 32(%rdi)
+; SSE-NEXT: movntpd %xmm3, 48(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt8xdouble:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntpd %ymm0, (%rdi)
+; AVX-NEXT: vmovntpd %ymm1, 32(%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt8xdouble:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntpd %zmm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <8 x double> %X, <8 x double>* %ptr, align 64, !nontemporal !1
+ ret void
+}
+
+define void @test_nt64xi8(<64 x i8>* nocapture %ptr, <64 x i8> %X) {
+; SSE-LABEL: test_nt64xi8:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm2, 32(%rdi)
+; SSE-NEXT: movntdq %xmm3, 48(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt64xi8:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX-NEXT: vmovntdq %ymm1, 32(%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_nt64xi8:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_nt64xi8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vmovntdq %zmm0, (%rdi)
+; AVX512BW-NEXT: retq
+entry:
+ store <64 x i8> %X, <64 x i8>* %ptr, align 64, !nontemporal !1
+ ret void
+}
+
+define void @test_nt32xi16(<32 x i16>* nocapture %ptr, <32 x i16> %X) {
+; SSE-LABEL: test_nt32xi16:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm2, 32(%rdi)
+; SSE-NEXT: movntdq %xmm3, 48(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt32xi16:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX-NEXT: vmovntdq %ymm1, 32(%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_nt32xi16:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_nt32xi16:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vmovntdq %zmm0, (%rdi)
+; AVX512BW-NEXT: retq
+entry:
+ store <32 x i16> %X, <32 x i16>* %ptr, align 64, !nontemporal !1
+ ret void
+}
+
+define void @test_nt16xi32(<16 x i32>* nocapture %ptr, <16 x i32> %X) {
+; SSE-LABEL: test_nt16xi32:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm2, 32(%rdi)
+; SSE-NEXT: movntdq %xmm3, 48(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt16xi32:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX-NEXT: vmovntdq %ymm1, 32(%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt16xi32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %zmm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <16 x i32> %X, <16 x i32>* %ptr, align 64, !nontemporal !1
+ ret void
+}
+
+define void @test_nt8xi64(<8 x i64>* nocapture %ptr, <8 x i64> %X) {
+; SSE-LABEL: test_nt8xi64:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm2, 32(%rdi)
+; SSE-NEXT: movntdq %xmm3, 48(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_nt8xi64:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX-NEXT: vmovntdq %ymm1, 32(%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_nt8xi64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdq %zmm0, (%rdi)
+; AVX512-NEXT: retq
+entry:
+ store <8 x i64> %X, <8 x i64>* %ptr, align 64, !nontemporal !1
+ ret void
+}
+
+;
+; 512-bit Vector Loads
+;
+
+define <16 x float> @test_load_nt16xfloat(<16 x float>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt16xfloat:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt16xfloat:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: movaps 32(%rdi), %xmm2
+; SSE4A-NEXT: movaps 48(%rdi), %xmm3
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt16xfloat:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt16xfloat:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt16xfloat:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt16xfloat:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <16 x float>, <16 x float>* %ptr, align 64, !nontemporal !1
+ ret <16 x float> %0
+}
+
+define <8 x double> @test_load_nt8xdouble(<8 x double>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt8xdouble:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movapd (%rdi), %xmm0
+; SSE2-NEXT: movapd 16(%rdi), %xmm1
+; SSE2-NEXT: movapd 32(%rdi), %xmm2
+; SSE2-NEXT: movapd 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt8xdouble:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movapd (%rdi), %xmm0
+; SSE4A-NEXT: movapd 16(%rdi), %xmm1
+; SSE4A-NEXT: movapd 32(%rdi), %xmm2
+; SSE4A-NEXT: movapd 48(%rdi), %xmm3
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt8xdouble:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt8xdouble:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovapd (%rdi), %ymm0
+; AVX1-NEXT: vmovapd 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt8xdouble:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt8xdouble:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <8 x double>, <8 x double>* %ptr, align 64, !nontemporal !1
+ ret <8 x double> %0
+}
+
+define <64 x i8> @test_load_nt64xi8(<64 x i8>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt64xi8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt64xi8:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: movaps 32(%rdi), %xmm2
+; SSE4A-NEXT: movaps 48(%rdi), %xmm3
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt64xi8:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt64xi8:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt64xi8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_load_nt64xi8:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_load_nt64xi8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512BW-NEXT: retq
+entry:
+ %0 = load <64 x i8>, <64 x i8>* %ptr, align 64, !nontemporal !1
+ ret <64 x i8> %0
+}
+
+define <32 x i16> @test_load_nt32xi16(<32 x i16>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt32xi16:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt32xi16:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: movaps 32(%rdi), %xmm2
+; SSE4A-NEXT: movaps 48(%rdi), %xmm3
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt32xi16:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt32xi16:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt32xi16:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_load_nt32xi16:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_load_nt32xi16:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512BW-NEXT: retq
+entry:
+ %0 = load <32 x i16>, <32 x i16>* %ptr, align 64, !nontemporal !1
+ ret <32 x i16> %0
+}
+
+define <16 x i32> @test_load_nt16xi32(<16 x i32>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt16xi32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt16xi32:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: movaps 32(%rdi), %xmm2
+; SSE4A-NEXT: movaps 48(%rdi), %xmm3
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt16xi32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt16xi32:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt16xi32:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt16xi32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <16 x i32>, <16 x i32>* %ptr, align 64, !nontemporal !1
+ ret <16 x i32> %0
+}
+
+define <8 x i64> @test_load_nt8xi64(<8 x i64>* nocapture %ptr) {
+; SSE2-LABEL: test_load_nt8xi64:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_load_nt8xi64:
+; SSE4A: # BB#0: # %entry
+; SSE4A-NEXT: movaps (%rdi), %xmm0
+; SSE4A-NEXT: movaps 16(%rdi), %xmm1
+; SSE4A-NEXT: movaps 32(%rdi), %xmm2
+; SSE4A-NEXT: movaps 48(%rdi), %xmm3
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_load_nt8xi64:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_load_nt8xi64:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_load_nt8xi64:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_load_nt8xi64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
+entry:
+ %0 = load <8 x i64>, <8 x i64>* %ptr, align 64, !nontemporal !1
+ ret <8 x i64> %0
+}
+
!1 = !{i32 1}
diff --git a/test/CodeGen/X86/fast-isel-stackcheck.ll b/test/CodeGen/X86/fast-isel-stackcheck.ll
index 3b7318fa77d9..1398b3006699 100644
--- a/test/CodeGen/X86/fast-isel-stackcheck.ll
+++ b/test/CodeGen/X86/fast-isel-stackcheck.ll
@@ -8,7 +8,7 @@ target triple = "x86_64-apple-macosx"
; CHECK-LABEL: foo:
; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), %rax
-; CHECK-NOT: movq ___stack_chk_guard@GOTPCREL(%rip), %rax
+; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), %rax
define void @foo() #0 {
entry:
%_tags = alloca [3 x i32], align 4
@@ -16,8 +16,10 @@ entry:
}
; CHECK-LABEL: bar:
-; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), %rax
-; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), %rax
+; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), %{{r.x}}
+; CHECK-DAG: movq ___stack_chk_guard@GOTPCREL(%rip), %[[GUARD:r.x]]
+; CHECK-DAG: movq {{[0-9]+}}(%rsp), %[[CANARY:r.x]]
+; CHECK: subq %[[CANARY]], %[[GUARD]]
define void @bar() #1 {
entry:
%vt = alloca [2 x double], align 16
diff --git a/test/CodeGen/X86/fast-isel-vecload.ll b/test/CodeGen/X86/fast-isel-vecload.ll
index 48eebf526f19..c5323f1c14f6 100644
--- a/test/CodeGen/X86/fast-isel-vecload.ll
+++ b/test/CodeGen/X86/fast-isel-vecload.ll
@@ -1,5 +1,6 @@
; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE --check-prefix=ALL
; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=ALL
+; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL
; Verify that fast-isel knows how to select aligned/unaligned vector loads.
; Also verify that the selected load instruction is in the correct domain.
@@ -183,3 +184,23 @@ entry:
%0 = load <2 x double>, <2 x double>* %V
ret <2 x double> %0
}
+
+define <8 x i64> @test_v8i64_alignment(<8 x i64>* %V) {
+; KNL-LABEL: test_v8i64_alignment:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqa64 (%rdi), %zmm0
+; KNL-NEXT: retq
+entry:
+ %0 = load <8 x i64>, <8 x i64>* %V, align 64
+ ret <8 x i64> %0
+}
+
+define <8 x i64> @test_v8i64(<8 x i64>* %V) {
+; KNL-LABEL: test_v8i64:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqu64 (%rdi), %zmm0
+; KNL-NEXT: retq
+entry:
+ %0 = load <8 x i64>, <8 x i64>* %V, align 4
+ ret <8 x i64> %0
+}
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
index d748cba2f8f8..ad0f11f4dc00 100644
--- a/test/CodeGen/X86/fast-isel-x86-64.ll
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mattr=-avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort=1 | FileCheck %s
+; RUN: llc < %s -mattr=-avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-verbose 2>&1 >/dev/null | FileCheck %s --check-prefix=STDERR --allow-empty
; RUN: llc < %s -mattr=+avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort=1 | FileCheck %s --check-prefix=AVX
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@@ -312,3 +313,10 @@ define void @allocamaterialize() {
call void @takesi32ptr(i32* %a)
ret void
}
+
+; STDERR-NOT: FastISel missed terminator: ret void
+; CHECK-LABEL: win64ccfun
+define x86_64_win64cc void @win64ccfun(i32 %i) {
+; CHECK: ret
+ ret void
+}
diff --git a/test/CodeGen/X86/fast-isel-x86.ll b/test/CodeGen/X86/fast-isel-x86.ll
index 8049c72ec018..8cddee5a7cd0 100644
--- a/test/CodeGen/X86/fast-isel-x86.ll
+++ b/test/CodeGen/X86/fast-isel-x86.ll
@@ -1,4 +1,5 @@
; RUN: llc -fast-isel -O0 -mcpu=generic -mtriple=i386-apple-darwin10 -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -fast-isel -O0 -mcpu=generic -mtriple=i386-apple-darwin10 -relocation-model=pic < %s -fast-isel-verbose 2>&1 >/dev/null | FileCheck -check-prefix=STDERR -allow-empty %s
; This should use flds to set the return value.
; CHECK-LABEL: test0:
@@ -18,11 +19,38 @@ define void @test1({i32, i32, i32, i32}* sret %p) nounwind {
ret void
}
+; This should pop 8 bytes on return.
+; CHECK-LABEL: thiscallfun:
+; CHECK: retl $8
+define x86_thiscallcc i32 @thiscallfun(i32* %this, i32 %a, i32 %b) nounwind {
+; STDERR-NOT: FastISel missed terminator: ret i32 12345
+ ret i32 12345
+}
+
+; Here, the callee pop doesn't fit the 16 bit immediate -- see x86-big-ret.ll
+; This checks that -fast-isel doesn't miscompile this.
+; CHECK-LABEL: thiscall_large:
+; CHECK: popl %ecx
+; CHECK-NEXT: addl $65536, %esp
+; CHECK-NEXT: pushl %ecx
+; CHECK-NEXT: retl
+define x86_thiscallcc void @thiscall_large(i32* %this, [65533 x i8]* byval %b) nounwind {
+ ret void
+}
+
+; This should pop 4 bytes on return.
+; CHECK-LABEL: stdcallfun:
+; CHECK: retl $4
+define x86_stdcallcc i32 @stdcallfun(i32 %a) nounwind {
+; STDERR-NOT: FastISel missed terminator: ret i32 54321
+ ret i32 54321
+}
+
; Properly initialize the pic base.
; CHECK-LABEL: test2:
; CHECK-NOT: HHH
-; CHECK: call{{.*}}L2$pb
-; CHECK-NEXT: L2$pb:
+; CHECK: call{{.*}}L5$pb
+; CHECK-NEXT: L5$pb:
; CHECK-NEXT: pop
; CHECK: HHH
; CHECK: retl
@@ -75,6 +103,7 @@ entry:
; SDag-ISel's arg push:
; CHECK: movl %esp, [[REGISTER:%[a-z]+]]
; CHECK: movl $42, ([[REGISTER]])
-; CHECK: movl __imp__test5dllimport
+; CHECK: movl L_test5dllimport$non_lazy_ptr-L8$pb(%eax), %eax
+
}
declare dllimport i32 @test5dllimport(i32)
diff --git a/test/CodeGen/X86/fastmath-float-half-conversion.ll b/test/CodeGen/X86/fastmath-float-half-conversion.ll
index 29308735cca2..637fcc215958 100644
--- a/test/CodeGen/X86/fastmath-float-half-conversion.ll
+++ b/test/CodeGen/X86/fastmath-float-half-conversion.ll
@@ -5,7 +5,7 @@ define zeroext i16 @test1_fast(double %d) #0 {
; ALL-LABEL: test1_fast:
; F16C-NOT: callq {{_+}}truncdfhf2
; F16C: vcvtsd2ss %xmm0, %xmm0, %xmm0
-; F16C-NEXT: vcvtps2ph $0, %xmm0, %xmm0
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX: callq {{_+}}truncdfhf2
; ALL: ret
entry:
@@ -19,7 +19,7 @@ define zeroext i16 @test2_fast(x86_fp80 %d) #0 {
; F16C: fldt
; F16C-NEXT: fstps
; F16C-NEXT: vmovss
-; F16C-NEXT: vcvtps2ph $0, %xmm0, %xmm0
+; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX: callq {{_+}}truncxfhf2
; ALL: ret
entry:
diff --git a/test/CodeGen/X86/fixup-bw-copy.ll b/test/CodeGen/X86/fixup-bw-copy.ll
new file mode 100644
index 000000000000..9067dfd29c17
--- /dev/null
+++ b/test/CodeGen/X86/fixup-bw-copy.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -fixup-byte-word-insts=1 -mtriple=x86_64-- < %s | FileCheck --check-prefix=X64 --check-prefix=BWON64 %s
+; RUN: llc -verify-machineinstrs -fixup-byte-word-insts=0 -mtriple=x86_64-- < %s | FileCheck --check-prefix=X64 --check-prefix=BWOFF64 %s
+; RUN: llc -verify-machineinstrs -fixup-byte-word-insts=1 -mtriple=i386-- < %s | FileCheck --check-prefix=X32 --check-prefix=BWON32 %s
+; RUN: llc -verify-machineinstrs -fixup-byte-word-insts=0 -mtriple=i386-- < %s | FileCheck --check-prefix=X32 --check-prefix=BWOFF32 %s
+
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+
+define i8 @test_movb(i8 %a0) {
+; BWON64-LABEL: test_movb:
+; BWON64: # BB#0:
+; BWON64-NEXT: movl %edi, %eax
+; BWON64-NEXT: retq
+;
+; BWOFF64-LABEL: test_movb:
+; BWOFF64: # BB#0:
+; BWOFF64-NEXT: movb %dil, %al
+; BWOFF64-NEXT: retq
+;
+; X32-LABEL: test_movb:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: retl
+ ret i8 %a0
+}
+
+define i16 @test_movw(i16 %a0) {
+; BWON64-LABEL: test_movw:
+; BWON64: # BB#0:
+; BWON64-NEXT: movl %edi, %eax
+; BWON64-NEXT: retq
+;
+; BWOFF64-LABEL: test_movw:
+; BWOFF64: # BB#0:
+; BWOFF64-NEXT: movw %di, %ax
+; BWOFF64-NEXT: retq
+;
+; BWON32-LABEL: test_movw:
+; BWON32: # BB#0:
+; BWON32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; BWON32-NEXT: retl
+;
+; BWOFF32-LABEL: test_movw:
+; BWOFF32: # BB#0:
+; BWOFF32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; BWOFF32-NEXT: retl
+ ret i16 %a0
+}
+
+; Verify we don't mess with H-reg copies (only generated in 32-bit mode).
+define i8 @test_movb_hreg(i16 %a0) {
+; X64-LABEL: test_movb_hreg:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shrl $8, %eax
+; X64-NEXT: addb %dil, %al
+; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: retq
+;
+; X32-LABEL: test_movb_hreg:
+; X32: # BB#0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: addb %al, %ah
+; X32-NEXT: movb %ah, %al
+; X32-NEXT: retl
+ %tmp0 = trunc i16 %a0 to i8
+ %tmp1 = lshr i16 %a0, 8
+ %tmp2 = trunc i16 %tmp1 to i8
+ %tmp3 = add i8 %tmp0, %tmp2
+ ret i8 %tmp3
+}
diff --git a/test/CodeGen/X86/fixup-bw-copy.mir b/test/CodeGen/X86/fixup-bw-copy.mir
new file mode 100644
index 000000000000..beff513cdbf5
--- /dev/null
+++ b/test/CodeGen/X86/fixup-bw-copy.mir
@@ -0,0 +1,156 @@
+# RUN: llc -run-pass x86-fixup-bw-insts -mtriple=x86_64-- -o - %s | FileCheck %s
+
+# Verify that we correctly deal with the flag edge cases when replacing
+# copies by bigger copies, which is a pretty unusual transform.
+
+--- |
+ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+ define i8 @test_movb_killed(i8 %a0) {
+ ret i8 %a0
+ }
+
+ define i8 @test_movb_impuse(i8 %a0) {
+ ret i8 %a0
+ }
+
+ define i8 @test_movb_impdef_gr64(i8 %a0) {
+ ret i8 %a0
+ }
+
+ define i8 @test_movb_impdef_gr32(i8 %a0) {
+ ret i8 %a0
+ }
+
+ define i8 @test_movb_impdef_gr16(i8 %a0) {
+ ret i8 %a0
+ }
+
+ define i16 @test_movw_impdef_gr32(i16 %a0) {
+ ret i16 %a0
+ }
+
+ define i16 @test_movw_impdef_gr64(i16 %a0) {
+ ret i16 %a0
+ }
+
+...
+
+---
+name: test_movb_killed
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%edi' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %edi
+
+ ; CHECK: %eax = MOV32rr undef %edi, implicit %dil
+ %al = MOV8rr killed %dil
+ RETQ killed %al
+
+...
+
+---
+name: test_movb_impuse
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%edi' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %edi
+
+ ; CHECK: %eax = MOV32rr undef %edi, implicit %dil
+ %al = MOV8rr %dil, implicit %edi
+ RETQ killed %al
+
+...
+
+---
+name: test_movb_impdef_gr64
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%edi' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %edi
+
+ ; CHECK: %eax = MOV32rr undef %edi, implicit %dil, implicit-def %rax
+ %al = MOV8rr %dil, implicit-def %rax
+ RETQ killed %al
+
+...
+
+---
+name: test_movb_impdef_gr32
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%edi' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %edi
+
+ ; CHECK: %eax = MOV32rr undef %edi, implicit %dil
+ %al = MOV8rr %dil, implicit-def %eax
+ RETQ killed %al
+
+...
+
+---
+name: test_movb_impdef_gr16
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%edi' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %edi
+
+ ; CHECK: %eax = MOV32rr undef %edi, implicit %dil
+ %al = MOV8rr %dil, implicit-def %ax
+ RETQ killed %al
+
+...
+
+---
+name: test_movw_impdef_gr32
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%edi' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %edi
+
+ ; CHECK: %eax = MOV32rr undef %edi, implicit %di
+ %ax = MOV16rr %di, implicit-def %eax
+ RETQ killed %ax
+
+...
+
+---
+name: test_movw_impdef_gr64
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%edi' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %edi
+
+ ; CHECK: %eax = MOV32rr undef %edi, implicit %di, implicit-def %rax
+ %ax = MOV16rr %di, implicit-def %rax
+ RETQ killed %ax
+
+...
diff --git a/test/CodeGen/X86/fixup-bw-inst.ll b/test/CodeGen/X86/fixup-bw-inst.ll
new file mode 100644
index 000000000000..6f83e6362d56
--- /dev/null
+++ b/test/CodeGen/X86/fixup-bw-inst.ll
@@ -0,0 +1,126 @@
+; RUN: llc -fixup-byte-word-insts=1 -march=x86-64 < %s | \
+; RUN: FileCheck -check-prefix CHECK -check-prefix BWON %s
+; RUN: llc -fixup-byte-word-insts=0 -march=x86-64 < %s | \
+; RUN: FileCheck -check-prefix CHECK -check-prefix BWOFF %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
+
+; This has byte loads interspersed with byte stores, in a single
+; basic-block loop. The upper portion should be dead, so the movb loads
+; should have been changed into movzbl instead.
+; CHECK-LABEL: foo1
+; load:
+; BWON: movzbl
+; BWOFF: movb
+; store:
+; CHECK: movb
+; load:
+; BWON: movzbl
+; BWOFF: movb
+; store:
+; CHECK: movb
+; CHECK: ret
+define void @foo1(i32 %count,
+ %struct.A* noalias nocapture %q,
+ %struct.A* noalias nocapture %p)
+ nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %count, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %2 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0
+ %3 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
+ br label %a4
+
+a4: ; preds = %4, %.lr.ph
+ %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
+ %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ]
+ %a5 = load i8, i8* %2, align 1
+ %a7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
+ store i8 %a5, i8* %a7, align 1
+ %a8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
+ %a6 = load i8, i8* %3, align 1
+ store i8 %a6, i8* %a8, align 1
+ %a9 = add nsw i32 %i.02, 1
+ %a10 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
+ %exitcond = icmp eq i32 %a9, %count
+ br i1 %exitcond, label %._crit_edge, label %a4
+
+._crit_edge: ; preds = %4, %0
+ ret void
+}
+
+%struct.B = type { i16, i16, i16, i16, i16, i16, i16, i16 }
+
+; This has word loads interspersed with word stores.
+; The upper portion should be dead, so the movw loads should have
+; been changed into movzwl instead.
+; CHECK-LABEL: foo2
+; load:
+; BWON: movzwl
+; BWOFF: movw
+; store:
+; CHECK: movw
+; load:
+; BWON: movzwl
+; BWOFF: movw
+; store:
+; CHECK: movw
+; CHECK: ret
+define void @foo2(i32 %count,
+ %struct.B* noalias nocapture %q,
+ %struct.B* noalias nocapture %p)
+ nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %count, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
+ %3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
+ br label %a4
+
+a4: ; preds = %4, %.lr.ph
+ %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
+ %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %a10, %a4 ]
+ %a5 = load i16, i16* %2, align 2
+ %a7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
+ store i16 %a5, i16* %a7, align 2
+ %a8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
+ %a6 = load i16, i16* %3, align 2
+ store i16 %a6, i16* %a8, align 2
+ %a9 = add nsw i32 %i.02, 1
+ %a10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
+ %exitcond = icmp eq i32 %a9, %count
+ br i1 %exitcond, label %._crit_edge, label %a4
+
+._crit_edge: ; preds = %4, %0
+ ret void
+}
+
+; This test contains nothing but a simple byte load and store. Since
+; movb encodes smaller, we do not want to use movzbl unless in a tight loop.
+; So this test checks that movb is used.
+; CHECK-LABEL: foo3:
+; CHECK: movb
+; CHECK: movb
+define void @foo3(i8 *%dst, i8 *%src) {
+ %t0 = load i8, i8 *%src, align 1
+ store i8 %t0, i8 *%dst, align 1
+ ret void
+}
+
+; This test contains nothing but a simple word load and store. Since
+; movw and movzwl are the same size, we should always choose to use
+; movzwl instead.
+; CHECK-LABEL: foo4:
+; BWON: movzwl
+; BWOFF: movw
+; CHECK: movw
+define void @foo4(i16 *%dst, i16 *%src) {
+ %t0 = load i16, i16 *%src, align 2
+ store i16 %t0, i16 *%dst, align 2
+ ret void
+}
diff --git a/test/CodeGen/X86/float-conv-elim.ll b/test/CodeGen/X86/float-conv-elim.ll
index 3feff851d91a..7ccad2b80c8b 100644
--- a/test/CodeGen/X86/float-conv-elim.ll
+++ b/test/CodeGen/X86/float-conv-elim.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 -mcpu=x86-64 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-unknown-linux-gnu -march=x86-64 -mcpu=x86-64 < %s | FileCheck %s
; Make sure the float conversion is folded away as it should be.
; CHECK-LABEL: foo
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index 76a4acf00f90..62d1b826b545 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=ALL --check-prefix=FMA4
@@ -22,7 +23,7 @@ define float @test_f32_fmadd(float %a0, float %a1, float %a2) {
; AVX512-LABEL: test_f32_fmadd:
; AVX512: # BB#0:
; AVX512-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul float %a0, %a1
%res = fadd float %x, %a2
@@ -83,7 +84,7 @@ define double @test_f64_fmadd(double %a0, double %a1, double %a2) {
; AVX512-LABEL: test_f64_fmadd:
; AVX512: # BB#0:
; AVX512-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul double %a0, %a1
%res = fadd double %x, %a2
@@ -148,7 +149,7 @@ define float @test_f32_fmsub(float %a0, float %a1, float %a2) {
; AVX512-LABEL: test_f32_fmsub:
; AVX512: # BB#0:
; AVX512-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul float %a0, %a1
%res = fsub float %x, %a2
@@ -209,7 +210,7 @@ define double @test_f64_fmsub(double %a0, double %a1, double %a2) {
; AVX512-LABEL: test_f64_fmsub:
; AVX512: # BB#0:
; AVX512-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul double %a0, %a1
%res = fsub double %x, %a2
@@ -274,7 +275,7 @@ define float @test_f32_fnmadd(float %a0, float %a1, float %a2) {
; AVX512-LABEL: test_f32_fnmadd:
; AVX512: # BB#0:
; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul float %a0, %a1
%res = fsub float %a2, %x
@@ -335,7 +336,7 @@ define double @test_f64_fnmadd(double %a0, double %a1, double %a2) {
; AVX512-LABEL: test_f64_fnmadd:
; AVX512: # BB#0:
; AVX512-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul double %a0, %a1
%res = fsub double %a2, %x
@@ -400,7 +401,7 @@ define float @test_f32_fnmsub(float %a0, float %a1, float %a2) {
; AVX512-LABEL: test_f32_fnmsub:
; AVX512: # BB#0:
; AVX512-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul float %a0, %a1
%y = fsub float -0.000000e+00, %x
@@ -464,7 +465,7 @@ define double @test_f64_fnmsub(double %a0, double %a1, double %a2) {
; AVX512-LABEL: test_f64_fnmsub:
; AVX512: # BB#0:
; AVX512-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul double %a0, %a1
%y = fsub double -0.000000e+00, %x
@@ -533,7 +534,7 @@ define <4 x float> @test_4f32_fmadd_load(<4 x float>* %a0, <4 x float> %a1, <4 x
; AVX512: # BB#0:
; AVX512-NEXT: vmovaps (%rdi), %xmm2
; AVX512-NEXT: vfmadd213ps %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vmovaps %zmm2, %zmm0
+; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
%x = load <4 x float>, <4 x float>* %a0
%y = fmul <4 x float> %x, %a1
@@ -556,7 +557,7 @@ define <2 x double> @test_2f64_fmsub_load(<2 x double>* %a0, <2 x double> %a1, <
; AVX512: # BB#0:
; AVX512-NEXT: vmovapd (%rdi), %xmm2
; AVX512-NEXT: vfmsub213pd %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vmovaps %zmm2, %zmm0
+; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
%x = load <2 x double>, <2 x double>* %a0
%y = fmul <2 x double> %x, %a1
@@ -829,7 +830,7 @@ define float @test_f32_interp(float %x, float %y, float %t) {
; AVX512: # BB#0:
; AVX512-NEXT: vfnmadd213ss %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vfmadd213ss %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vmovaps %zmm2, %zmm0
+; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
%t1 = fsub float 1.0, %t
%tx = fmul float %x, %t
@@ -853,7 +854,7 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float
;
; AVX512-LABEL: test_v4f32_interp:
; AVX512: # BB#0:
-; AVX512-NEXT: vmovaps %zmm2, %zmm3
+; AVX512-NEXT: vmovaps %xmm2, %xmm3
; AVX512-NEXT: vfnmadd213ps %xmm1, %xmm1, %xmm3
; AVX512-NEXT: vfmadd213ps %xmm3, %xmm2, %xmm0
; AVX512-NEXT: retq
@@ -879,7 +880,7 @@ define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float
;
; AVX512-LABEL: test_v8f32_interp:
; AVX512: # BB#0:
-; AVX512-NEXT: vmovaps %zmm2, %zmm3
+; AVX512-NEXT: vmovaps %ymm2, %ymm3
; AVX512-NEXT: vfnmadd213ps %ymm1, %ymm1, %ymm3
; AVX512-NEXT: vfmadd213ps %ymm3, %ymm2, %ymm0
; AVX512-NEXT: retq
@@ -907,7 +908,7 @@ define double @test_f64_interp(double %x, double %y, double %t) {
; AVX512: # BB#0:
; AVX512-NEXT: vfnmadd213sd %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vfmadd213sd %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vmovaps %zmm2, %zmm0
+; AVX512-NEXT: vmovaps %xmm2, %xmm0
; AVX512-NEXT: retq
%t1 = fsub double 1.0, %t
%tx = fmul double %x, %t
@@ -931,7 +932,7 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do
;
; AVX512-LABEL: test_v2f64_interp:
; AVX512: # BB#0:
-; AVX512-NEXT: vmovaps %zmm2, %zmm3
+; AVX512-NEXT: vmovaps %xmm2, %xmm3
; AVX512-NEXT: vfnmadd213pd %xmm1, %xmm1, %xmm3
; AVX512-NEXT: vfmadd213pd %xmm3, %xmm2, %xmm0
; AVX512-NEXT: retq
@@ -957,7 +958,7 @@ define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x do
;
; AVX512-LABEL: test_v4f64_interp:
; AVX512: # BB#0:
-; AVX512-NEXT: vmovaps %zmm2, %zmm3
+; AVX512-NEXT: vmovaps %ymm2, %ymm3
; AVX512-NEXT: vfnmadd213pd %ymm1, %ymm1, %ymm3
; AVX512-NEXT: vfmadd213pd %ymm3, %ymm2, %ymm0
; AVX512-NEXT: retq
@@ -1101,7 +1102,7 @@ define <4 x float> @test_v4f32_fma_fmul_x_c1_c2_y(<4 x float> %x, <4 x float> %y
; AVX512-LABEL: test_v4f32_fma_fmul_x_c1_c2_y:
; AVX512: # BB#0:
; AVX512-NEXT: vfmadd231ps {{.*}}(%rip), %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%m0 = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
%m1 = fmul <4 x float> %m0, <float 4.0, float 3.0, float 2.0, float 1.0>
@@ -1128,7 +1129,7 @@ define double @test_f64_fneg_fmul(double %x, double %y) #0 {
; AVX512: # BB#0:
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
%m = fmul nsz double %x, %y
%n = fsub double -0.0, %m
@@ -1150,7 +1151,7 @@ define <4 x float> @test_v4f32_fneg_fmul(<4 x float> %x, <4 x float> %y) #0 {
;
; AVX512-LABEL: test_v4f32_fneg_fmul:
; AVX512: # BB#0:
-; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpxord %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%m = fmul nsz <4 x float> %x, %y
@@ -1173,7 +1174,7 @@ define <4 x double> @test_v4f64_fneg_fmul(<4 x double> %x, <4 x double> %y) #0 {
;
; AVX512-LABEL: test_v4f64_fneg_fmul:
; AVX512: # BB#0:
-; AVX512-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; AVX512-NEXT: vpxord %ymm2, %ymm2, %ymm2
; AVX512-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
; AVX512-NEXT: retq
%m = fmul nsz <4 x double> %x, %y
diff --git a/test/CodeGen/X86/fold-push.ll b/test/CodeGen/X86/fold-push.ll
index eaf91351021f..9d3afd1c449b 100644
--- a/test/CodeGen/X86/fold-push.ll
+++ b/test/CodeGen/X86/fold-push.ll
@@ -14,7 +14,7 @@ define void @test(i32 %a, i32 %b) optsize nounwind {
; SLM: movl (%esp), [[RELOAD:%e..]]
; SLM-NEXT: pushl [[RELOAD]]
; CHECK: calll
-; CHECK-NEXT: addl $4, %esp
+; CHECK-NEXT: addl $8, %esp
%c = add i32 %a, %b
call void @foo(i32 %c)
call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di}"()
diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll
index 62fed4219387..5c481197c3b4 100644
--- a/test/CodeGen/X86/fold-tied-op.ll
+++ b/test/CodeGen/X86/fold-tied-op.ll
@@ -6,10 +6,7 @@ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
target triple = "i386--netbsd"
; CHECK-LABEL: fn1
-; CHECK: shldl {{.*#+}} 4-byte Folded Spill
-; CHECK: orl {{.*#+}} 4-byte Folded Reload
-; CHECK: shldl {{.*#+}} 4-byte Folded Spill
-; CHECK: orl {{.*#+}} 4-byte Folded Reload
+; CHECK: addl {{.*#+}} 4-byte Folded Reload
; CHECK: addl {{.*#+}} 4-byte Folded Reload
; CHECK: imull {{.*#+}} 4-byte Folded Reload
; CHECK: orl {{.*#+}} 4-byte Folded Reload
diff --git a/test/CodeGen/X86/fold-vector-sext-zext.ll b/test/CodeGen/X86/fold-vector-sext-zext.ll
index aeaab4479085..6299280eb98d 100644
--- a/test/CodeGen/X86/fold-vector-sext-zext.ll
+++ b/test/CodeGen/X86/fold-vector-sext-zext.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -mattr=+avx | FileCheck %s
; Verify that the backend correctly folds a sign/zero extend of a vector where
@@ -6,8 +7,11 @@
; simple loads from constant pool of the result. That is because the resulting
; vector should be known at compile time.
-
define <4 x i16> @test1() {
+; CHECK-LABEL: test1:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,2,4294967293]
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 2, i32 2
@@ -15,11 +19,12 @@ define <4 x i16> @test1() {
%5 = sext <4 x i8> %4 to <4 x i16>
ret <4 x i16> %5
}
-; CHECK-LABEL: test1
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <4 x i16> @test2() {
+; CHECK-LABEL: test2:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <u,4294967295,u,4294967293>
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 undef, i32 2
@@ -27,11 +32,12 @@ define <4 x i16> @test2() {
%5 = sext <4 x i8> %4 to <4 x i16>
ret <4 x i16> %5
}
-; CHECK-LABEL: test2
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <4 x i32> @test3() {
+; CHECK-LABEL: test3:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,2,4294967293]
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 2, i32 2
@@ -39,11 +45,12 @@ define <4 x i32> @test3() {
%5 = sext <4 x i8> %4 to <4 x i32>
ret <4 x i32> %5
}
-; CHECK-LABEL: test3
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <4 x i32> @test4() {
+; CHECK-LABEL: test4:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <u,4294967295,u,4294967293>
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 undef, i32 2
@@ -51,12 +58,12 @@ define <4 x i32> @test4() {
%5 = sext <4 x i8> %4 to <4 x i32>
ret <4 x i32> %5
}
-; CHECK-LABEL: test4
-; CHECK: vmovaps
-; CHECK-NEXT: ret
-
define <4 x i64> @test5() {
+; CHECK-LABEL: test5:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [0,18446744073709551615,2,18446744073709551613]
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 2, i32 2
@@ -64,12 +71,12 @@ define <4 x i64> @test5() {
%5 = sext <4 x i8> %4 to <4 x i64>
ret <4 x i64> %5
}
-; CHECK-LABEL: test5
-; CHECK-NOT: vinsertf128
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <4 x i64> @test6() {
+; CHECK-LABEL: test6:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <u,18446744073709551615,u,18446744073709551613>
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 undef, i32 2
@@ -77,12 +84,12 @@ define <4 x i64> @test6() {
%5 = sext <4 x i8> %4 to <4 x i64>
ret <4 x i64> %5
}
-; CHECK-LABEL: test6
-; CHECK-NOT: vinsertf128
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <8 x i16> @test7() {
+; CHECK-LABEL: test7:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u>
+; CHECK-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 0, i32 0
%2 = insertelement <8 x i8> %1, i8 -1, i32 1
%3 = insertelement <8 x i8> %2, i8 2, i32 2
@@ -94,11 +101,12 @@ define <8 x i16> @test7() {
%9 = sext <8 x i8> %4 to <8 x i16>
ret <8 x i16> %9
}
-; CHECK-LABEL: test7
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <8 x i32> @test8() {
+; CHECK-LABEL: test8:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <0,4294967295,2,4294967293,u,u,u,u>
+; CHECK-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 0, i32 0
%2 = insertelement <8 x i8> %1, i8 -1, i32 1
%3 = insertelement <8 x i8> %2, i8 2, i32 2
@@ -110,12 +118,12 @@ define <8 x i32> @test8() {
%9 = sext <8 x i8> %4 to <8 x i32>
ret <8 x i32> %9
}
-; CHECK-LABEL: test8
-; CHECK-NOT: vinsertf128
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <8 x i16> @test9() {
+; CHECK-LABEL: test9:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <u,65535,u,65533,u,u,u,u>
+; CHECK-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 undef, i32 0
%2 = insertelement <8 x i8> %1, i8 -1, i32 1
%3 = insertelement <8 x i8> %2, i8 undef, i32 2
@@ -127,11 +135,12 @@ define <8 x i16> @test9() {
%9 = sext <8 x i8> %4 to <8 x i16>
ret <8 x i16> %9
}
-; CHECK-LABEL: test9
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <8 x i32> @test10() {
+; CHECK-LABEL: test10:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <0,u,2,u,u,u,u,u>
+; CHECK-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 0, i32 0
%2 = insertelement <8 x i8> %1, i8 undef, i32 1
%3 = insertelement <8 x i8> %2, i8 2, i32 2
@@ -143,13 +152,12 @@ define <8 x i32> @test10() {
%9 = sext <8 x i8> %4 to <8 x i32>
ret <8 x i32> %9
}
-; CHECK-LABEL: test10
-; CHECK-NOT: vinsertf128
-; CHECK: vmovaps
-; CHECK-NEXT: ret
-
define <4 x i16> @test11() {
+; CHECK-LABEL: test11:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253]
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 2, i32 2
@@ -157,11 +165,12 @@ define <4 x i16> @test11() {
%5 = zext <4 x i8> %4 to <4 x i16>
ret <4 x i16> %5
}
-; CHECK-LABEL: test11
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <4 x i32> @test12() {
+; CHECK-LABEL: test12:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253]
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 2, i32 2
@@ -169,11 +178,12 @@ define <4 x i32> @test12() {
%5 = zext <4 x i8> %4 to <4 x i32>
ret <4 x i32> %5
}
-; CHECK-LABEL: test12
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <4 x i64> @test13() {
+; CHECK-LABEL: test13:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,2,253]
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 2, i32 2
@@ -181,12 +191,12 @@ define <4 x i64> @test13() {
%5 = zext <4 x i8> %4 to <4 x i64>
ret <4 x i64> %5
}
-; CHECK-LABEL: test13
-; CHECK-NOT: vinsertf128
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <4 x i16> @test14() {
+; CHECK-LABEL: test14:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <u,255,u,253>
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 undef, i32 2
@@ -194,11 +204,12 @@ define <4 x i16> @test14() {
%5 = zext <4 x i8> %4 to <4 x i16>
ret <4 x i16> %5
}
-; CHECK-LABEL: test14
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <4 x i32> @test15() {
+; CHECK-LABEL: test15:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <0,u,2,u>
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
%2 = insertelement <4 x i8> %1, i8 undef, i32 1
%3 = insertelement <4 x i8> %2, i8 2, i32 2
@@ -206,11 +217,12 @@ define <4 x i32> @test15() {
%5 = zext <4 x i8> %4 to <4 x i32>
ret <4 x i32> %5
}
-; CHECK-LABEL: test15
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <4 x i64> @test16() {
+; CHECK-LABEL: test16:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <u,255,2,u>
+; CHECK-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
%2 = insertelement <4 x i8> %1, i8 -1, i32 1
%3 = insertelement <4 x i8> %2, i8 2, i32 2
@@ -218,12 +230,12 @@ define <4 x i64> @test16() {
%5 = zext <4 x i8> %4 to <4 x i64>
ret <4 x i64> %5
}
-; CHECK-LABEL: test16
-; CHECK-NOT: vinsertf128
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <8 x i16> @test17() {
+; CHECK-LABEL: test17:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253,4,251,6,249]
+; CHECK-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 0, i32 0
%2 = insertelement <8 x i8> %1, i8 -1, i32 1
%3 = insertelement <8 x i8> %2, i8 2, i32 2
@@ -235,11 +247,12 @@ define <8 x i16> @test17() {
%9 = zext <8 x i8> %8 to <8 x i16>
ret <8 x i16> %9
}
-; CHECK-LABEL: test17
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <8 x i32> @test18() {
+; CHECK-LABEL: test18:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,2,253,4,251,6,249]
+; CHECK-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 0, i32 0
%2 = insertelement <8 x i8> %1, i8 -1, i32 1
%3 = insertelement <8 x i8> %2, i8 2, i32 2
@@ -251,12 +264,12 @@ define <8 x i32> @test18() {
%9 = zext <8 x i8> %8 to <8 x i32>
ret <8 x i32> %9
}
-; CHECK-LABEL: test18
-; CHECK-NOT: vinsertf128
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <8 x i16> @test19() {
+; CHECK-LABEL: test19:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <u,255,u,253,u,251,u,249>
+; CHECK-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 undef, i32 0
%2 = insertelement <8 x i8> %1, i8 -1, i32 1
%3 = insertelement <8 x i8> %2, i8 undef, i32 2
@@ -268,11 +281,12 @@ define <8 x i16> @test19() {
%9 = zext <8 x i8> %8 to <8 x i16>
ret <8 x i16> %9
}
-; CHECK-LABEL: test19
-; CHECK: vmovaps
-; CHECK-NEXT: ret
define <8 x i32> @test20() {
+; CHECK-LABEL: test20:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <0,u,2,253,4,u,6,u>
+; CHECK-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 0, i32 0
%2 = insertelement <8 x i8> %1, i8 undef, i32 1
%3 = insertelement <8 x i8> %2, i8 2, i32 2
@@ -284,8 +298,3 @@ define <8 x i32> @test20() {
%9 = zext <8 x i8> %8 to <8 x i32>
ret <8 x i32> %9
}
-; CHECK-LABEL: test20
-; CHECK-NOT: vinsertf128
-; CHECK: vmovaps
-; CHECK-NEXT: ret
-
diff --git a/test/CodeGen/X86/force-align-stack-alloca.ll b/test/CodeGen/X86/force-align-stack-alloca.ll
index d0cf34170081..8d42680e199b 100644
--- a/test/CodeGen/X86/force-align-stack-alloca.ll
+++ b/test/CodeGen/X86/force-align-stack-alloca.ll
@@ -32,15 +32,21 @@ define i64 @g(i32 %i) nounwind {
; CHECK: movl %{{...}}, %esp
; CHECK-NOT: {{[^ ,]*}}, %esp
;
-; Next we set up the memset call, and then undo it.
+; Next we set up the memset call.
; CHECK: subl $20, %esp
; CHECK-NOT: {{[^ ,]*}}, %esp
+; CHECK: pushl
+; CHECK: pushl
+; CHECK: pushl
; CHECK: calll memset
-; CHECK-NEXT: addl $32, %esp
+;
+; Deallocating 32 bytes of outgoing call frame for memset and
+; allocating 28 bytes for calling f yields a 4-byte adjustment:
+; CHECK-NEXT: addl $4, %esp
; CHECK-NOT: {{[^ ,]*}}, %esp
;
-; Next we set up the call to 'f'.
-; CHECK: subl $28, %esp
+; And move on to call 'f', and then restore the stack.
+; CHECK: pushl
; CHECK-NOT: {{[^ ,]*}}, %esp
; CHECK: calll f
; CHECK-NEXT: addl $32, %esp
diff --git a/test/CodeGen/X86/fp-logic.ll b/test/CodeGen/X86/fp-logic.ll
index 64c3f6b79a23..9ab6751d6548 100644
--- a/test/CodeGen/X86/fp-logic.ll
+++ b/test/CodeGen/X86/fp-logic.ll
@@ -262,3 +262,51 @@ define float @movmsk(float %x) {
ret float %bc2
}
+define double @bitcast_fabs(double %x) {
+; CHECK-LABEL: bitcast_fabs:
+; CHECK: # BB#0:
+; CHECK-NEXT: andpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
+;
+ %bc1 = bitcast double %x to i64
+ %and = and i64 %bc1, 9223372036854775807
+ %bc2 = bitcast i64 %and to double
+ ret double %bc2
+}
+
+define float @bitcast_fneg(float %x) {
+; CHECK-LABEL: bitcast_fneg:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
+;
+ %bc1 = bitcast float %x to i32
+ %xor = xor i32 %bc1, 2147483648
+ %bc2 = bitcast i32 %xor to float
+ ret float %bc2
+}
+
+define <2 x double> @bitcast_fabs_vec(<2 x double> %x) {
+; CHECK-LABEL: bitcast_fabs_vec:
+; CHECK: # BB#0:
+; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
+;
+ %bc1 = bitcast <2 x double> %x to <2 x i64>
+ %and = and <2 x i64> %bc1, <i64 9223372036854775807, i64 9223372036854775807>
+ %bc2 = bitcast <2 x i64> %and to <2 x double>
+ ret <2 x double> %bc2
+}
+
+define <4 x float> @bitcast_fneg_vec(<4 x float> %x) {
+; CHECK-LABEL: bitcast_fneg_vec:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
+;
+ %bc1 = bitcast <4 x float> %x to <4 x i32>
+ %xor = xor <4 x i32> %bc1, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
+ %bc2 = bitcast <4 x i32> %xor to <4 x float>
+ ret <4 x float> %bc2
+}
+
diff --git a/test/CodeGen/X86/fp-une-cmp.ll b/test/CodeGen/X86/fp-une-cmp.ll
index 7f772d11da9a..653040053c27 100644
--- a/test/CodeGen/X86/fp-une-cmp.ll
+++ b/test/CodeGen/X86/fp-une-cmp.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -march=x86 -mattr=sse4.1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
; <rdar://problem/7859988>
; Make sure we don't generate more jumps than we need to. We used to generate
@@ -19,25 +21,115 @@
; addsd ...
; LBB0_2:
-; CHECK: func
-; CHECK: jne [[LABEL:.*]]
-; CHECK-NEXT: jp [[LABEL]]
-; CHECK-NOT: jmp
+define double @rdar_7859988(double %x, double %y) nounwind readnone optsize ssp {
+; CHECK-LABEL: rdar_7859988:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: mulsd %xmm1, %xmm0
+; CHECK-NEXT: xorpd %xmm1, %xmm1
+; CHECK-NEXT: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: jne .LBB0_2
+; CHECK-NEXT: jp .LBB0_2
+; CHECK-NEXT: # BB#1: # %bb1
+; CHECK-NEXT: addsd {{.*}}(%rip), %xmm0
+; CHECK-NEXT: .LBB0_2: # %bb2
+; CHECK-NEXT: retq
-define float @func(float %x, float %y) nounwind readnone optsize ssp {
entry:
- %0 = fpext float %x to double
- %1 = fpext float %y to double
- %2 = fmul double %0, %1
- %3 = fcmp une double %2, 0.000000e+00
- br i1 %3, label %bb2, label %bb1
+ %mul = fmul double %x, %y
+ %cmp = fcmp une double %mul, 0.000000e+00
+ br i1 %cmp, label %bb2, label %bb1
bb1:
- %4 = fadd double %2, -1.000000e+00
+ %add = fadd double %mul, -1.000000e+00
br label %bb2
bb2:
- %.0.in = phi double [ %4, %bb1 ], [ %2, %entry ]
- %.0 = fptrunc double %.0.in to float
- ret float %.0
+ %phi = phi double [ %add, %bb1 ], [ %mul, %entry ]
+ ret double %phi
}
+
+define double @profile_metadata(double %x, double %y) {
+; CHECK-LABEL: profile_metadata:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: mulsd %xmm1, %xmm0
+; CHECK-NEXT: xorpd %xmm1, %xmm1
+; CHECK-NEXT: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: jne .LBB1_1
+; CHECK-NEXT: jp .LBB1_1
+; CHECK-NEXT: .LBB1_2: # %bb2
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB1_1: # %bb1
+; CHECK-NEXT: addsd {{.*}}(%rip), %xmm0
+; CHECK-NEXT: jmp .LBB1_2
+
+entry:
+ %mul = fmul double %x, %y
+ %cmp = fcmp une double %mul, 0.000000e+00
+ br i1 %cmp, label %bb1, label %bb2, !prof !1
+
+bb1:
+ %add = fadd double %mul, -1.000000e+00
+ br label %bb2
+
+bb2:
+ %phi = phi double [ %add, %bb1 ], [ %mul, %entry ]
+ ret double %phi
+}
+
+; Test that the negation of the non-equality check between floating-point
+; values is translated to jnp followed by jne.
+
+define void @foo(float %f) {
+; CHECK-LABEL: foo:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: jne .LBB2_2
+; CHECK-NEXT: jnp .LBB2_1
+; CHECK-NEXT: .LBB2_2: # %if.then
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: .LBB2_1: # %if.end
+; CHECK-NEXT: retq
+entry:
+ %cmp = fcmp une float %f, 0.000000e+00
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ tail call void @a()
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+; Test that an FP oeq/une conditional branch can be inverted successfully even
+; when the true and false targets are the same (PR27750).
+;
+; CHECK-LABEL: pr27750
+; CHECK: ucomiss
+; CHECK-NEXT: jne [[TARGET:.*]]
+; CHECK-NEXT: jp [[TARGET]]
+define void @pr27750(i32* %b, float %x, i1 %y) {
+entry:
+ br label %for.cond
+
+for.cond:
+ br label %for.cond1
+
+for.cond1:
+ br i1 %y, label %for.body3.lr.ph, label %for.end
+
+for.body3.lr.ph:
+ store i32 0, i32* %b, align 4
+ br label %for.end
+
+for.end:
+; After block %for.cond gets eliminated, the two target blocks of this
+; conditional branch are the same.
+ %tobool = fcmp une float %x, 0.000000e+00
+ br i1 %tobool, label %for.cond, label %for.cond1
+}
+
+declare void @a()
+
+!1 = !{!"branch_weights", i32 1, i32 1000}
diff --git a/test/CodeGen/X86/fp128-cast.ll b/test/CodeGen/X86/fp128-cast.ll
index 73878e31d0ef..2d872498dfc7 100644
--- a/test/CodeGen/X86/fp128-cast.ll
+++ b/test/CodeGen/X86/fp128-cast.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s
-; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -O2 -mtriple=i686-linux-gnu -mattr=+mmx | FileCheck %s --check-prefix=X32
; Check soft floating point conversion function calls.
@@ -17,11 +18,17 @@ entry:
%conv = fpext float %0 to fp128
store fp128 %conv, fp128* @vf128, align 16
ret void
-; CHECK-LABEL: TestFPExtF32_F128:
-; CHECK: movss vf32(%rip), %xmm0
-; CHECK-NEXT: callq __extendsftf2
-; CHECK-NEXT: movaps %xmm0, vf128(%rip)
-; CHECK: retq
+; X32-LABEL: TestFPExtF32_F128:
+; X32: flds vf32
+; X32: fstps
+; X32: calll __extendsftf2
+; X32: retl
+;
+; X64-LABEL: TestFPExtF32_F128:
+; X64: movss vf32(%rip), %xmm0
+; X64-NEXT: callq __extendsftf2
+; X64-NEXT: movaps %xmm0, vf128(%rip)
+; X64: retq
}
define void @TestFPExtF64_F128() {
@@ -30,11 +37,17 @@ entry:
%conv = fpext double %0 to fp128
store fp128 %conv, fp128* @vf128, align 16
ret void
-; CHECK-LABEL: TestFPExtF64_F128:
-; CHECK: movsd vf64(%rip), %xmm0
-; CHECK-NEXT: callq __extenddftf2
-; CHECK-NEXT: movapd %xmm0, vf128(%rip)
-; CHECK: ret
+; X32-LABEL: TestFPExtF64_F128:
+; X32: fldl vf64
+; X32: fstpl
+; X32: calll __extenddftf2
+; X32: retl
+;
+; X64-LABEL: TestFPExtF64_F128:
+; X64: movsd vf64(%rip), %xmm0
+; X64-NEXT: callq __extenddftf2
+; X64-NEXT: movapd %xmm0, vf128(%rip)
+; X64: ret
}
define void @TestFPToSIF128_I32() {
@@ -43,11 +56,15 @@ entry:
%conv = fptosi fp128 %0 to i32
store i32 %conv, i32* @vi32, align 4
ret void
-; CHECK-LABEL: TestFPToSIF128_I32:
-; CHECK: movaps vf128(%rip), %xmm0
-; CHECK-NEXT: callq __fixtfsi
-; CHECK-NEXT: movl %eax, vi32(%rip)
-; CHECK: retq
+; X32-LABEL: TestFPToSIF128_I32:
+; X32: calll __fixtfsi
+; X32: retl
+;
+; X64-LABEL: TestFPToSIF128_I32:
+; X64: movaps vf128(%rip), %xmm0
+; X64-NEXT: callq __fixtfsi
+; X64-NEXT: movl %eax, vi32(%rip)
+; X64: retq
}
define void @TestFPToUIF128_U32() {
@@ -56,11 +73,15 @@ entry:
%conv = fptoui fp128 %0 to i32
store i32 %conv, i32* @vu32, align 4
ret void
-; CHECK-LABEL: TestFPToUIF128_U32:
-; CHECK: movaps vf128(%rip), %xmm0
-; CHECK-NEXT: callq __fixunstfsi
-; CHECK-NEXT: movl %eax, vu32(%rip)
-; CHECK: retq
+; X32-LABEL: TestFPToUIF128_U32:
+; X32: calll __fixunstfsi
+; X32: retl
+;
+; X64-LABEL: TestFPToUIF128_U32:
+; X64: movaps vf128(%rip), %xmm0
+; X64-NEXT: callq __fixunstfsi
+; X64-NEXT: movl %eax, vu32(%rip)
+; X64: retq
}
define void @TestFPToSIF128_I64() {
@@ -70,12 +91,16 @@ entry:
%conv1 = sext i32 %conv to i64
store i64 %conv1, i64* @vi64, align 8
ret void
-; CHECK-LABEL: TestFPToSIF128_I64:
-; CHECK: movaps vf128(%rip), %xmm0
-; CHECK-NEXT: callq __fixtfsi
-; CHECK-NEXT: cltq
-; CHECK-NEXT: movq %rax, vi64(%rip)
-; CHECK: retq
+; X32-LABEL: TestFPToSIF128_I64:
+; X32: calll __fixtfsi
+; X32: retl
+;
+; X64-LABEL: TestFPToSIF128_I64:
+; X64: movaps vf128(%rip), %xmm0
+; X64-NEXT: callq __fixtfsi
+; X64-NEXT: cltq
+; X64-NEXT: movq %rax, vi64(%rip)
+; X64: retq
}
define void @TestFPToUIF128_U64() {
@@ -85,12 +110,16 @@ entry:
%conv1 = zext i32 %conv to i64
store i64 %conv1, i64* @vu64, align 8
ret void
-; CHECK-LABEL: TestFPToUIF128_U64:
-; CHECK: movaps vf128(%rip), %xmm0
-; CHECK-NEXT: callq __fixunstfsi
-; CHECK-NEXT: movl %eax, %eax
-; CHECK-NEXT: movq %rax, vu64(%rip)
-; CHECK: retq
+; X32-LABEL: TestFPToUIF128_U64:
+; X32: calll __fixunstfsi
+; X32: retl
+;
+; X64-LABEL: TestFPToUIF128_U64:
+; X64: movaps vf128(%rip), %xmm0
+; X64-NEXT: callq __fixunstfsi
+; X64-NEXT: movl %eax, %eax
+; X64-NEXT: movq %rax, vu64(%rip)
+; X64: retq
}
define void @TestFPTruncF128_F32() {
@@ -99,11 +128,16 @@ entry:
%conv = fptrunc fp128 %0 to float
store float %conv, float* @vf32, align 4
ret void
-; CHECK-LABEL: TestFPTruncF128_F32:
-; CHECK: movaps vf128(%rip), %xmm0
-; CHECK-NEXT: callq __trunctfsf2
-; CHECK-NEXT: movss %xmm0, vf32(%rip)
-; CHECK: retq
+; X32-LABEL: TestFPTruncF128_F32:
+; X32: calll __trunctfsf2
+; X32: fstps vf32
+; X32: retl
+;
+; X64-LABEL: TestFPTruncF128_F32:
+; X64: movaps vf128(%rip), %xmm0
+; X64-NEXT: callq __trunctfsf2
+; X64-NEXT: movss %xmm0, vf32(%rip)
+; X64: retq
}
define void @TestFPTruncF128_F64() {
@@ -112,11 +146,16 @@ entry:
%conv = fptrunc fp128 %0 to double
store double %conv, double* @vf64, align 8
ret void
-; CHECK-LABEL: TestFPTruncF128_F64:
-; CHECK: movapd vf128(%rip), %xmm0
-; CHECK-NEXT: callq __trunctfdf2
-; CHECK-NEXT: movsd %xmm0, vf64(%rip)
-; CHECK: retq
+; X32-LABEL: TestFPTruncF128_F64:
+; X32: calll __trunctfdf2
+; X32: fstpl vf64
+; X32: retl
+;
+; X64-LABEL: TestFPTruncF128_F64:
+; X64: movapd vf128(%rip), %xmm0
+; X64-NEXT: callq __trunctfdf2
+; X64-NEXT: movsd %xmm0, vf64(%rip)
+; X64: retq
}
define void @TestSIToFPI32_F128() {
@@ -125,11 +164,15 @@ entry:
%conv = sitofp i32 %0 to fp128
store fp128 %conv, fp128* @vf128, align 16
ret void
-; CHECK-LABEL: TestSIToFPI32_F128:
-; CHECK: movl vi32(%rip), %edi
-; CHECK-NEXT: callq __floatsitf
-; CHECK-NEXT: movaps %xmm0, vf128(%rip)
-; CHECK: retq
+; X32-LABEL: TestSIToFPI32_F128:
+; X32: calll __floatsitf
+; X32: retl
+;
+; X64-LABEL: TestSIToFPI32_F128:
+; X64: movl vi32(%rip), %edi
+; X64-NEXT: callq __floatsitf
+; X64-NEXT: movaps %xmm0, vf128(%rip)
+; X64: retq
}
define void @TestUIToFPU32_F128() #2 {
@@ -138,11 +181,15 @@ entry:
%conv = uitofp i32 %0 to fp128
store fp128 %conv, fp128* @vf128, align 16
ret void
-; CHECK-LABEL: TestUIToFPU32_F128:
-; CHECK: movl vu32(%rip), %edi
-; CHECK-NEXT: callq __floatunsitf
-; CHECK-NEXT: movaps %xmm0, vf128(%rip)
-; CHECK: retq
+; X32-LABEL: TestUIToFPU32_F128:
+; X32: calll __floatunsitf
+; X32: retl
+;
+; X64-LABEL: TestUIToFPU32_F128:
+; X64: movl vu32(%rip), %edi
+; X64-NEXT: callq __floatunsitf
+; X64-NEXT: movaps %xmm0, vf128(%rip)
+; X64: retq
}
define void @TestSIToFPI64_F128(){
@@ -151,11 +198,15 @@ entry:
%conv = sitofp i64 %0 to fp128
store fp128 %conv, fp128* @vf128, align 16
ret void
-; CHECK-LABEL: TestSIToFPI64_F128:
-; CHECK: movq vi64(%rip), %rdi
-; CHECK-NEXT: callq __floatditf
-; CHECK-NEXT: movaps %xmm0, vf128(%rip)
-; CHECK: retq
+; X32-LABEL: TestSIToFPI64_F128:
+; X32: calll __floatditf
+; X32: retl
+;
+; X64-LABEL: TestSIToFPI64_F128:
+; X64: movq vi64(%rip), %rdi
+; X64-NEXT: callq __floatditf
+; X64-NEXT: movaps %xmm0, vf128(%rip)
+; X64: retq
}
define void @TestUIToFPU64_F128() #2 {
@@ -164,11 +215,15 @@ entry:
%conv = uitofp i64 %0 to fp128
store fp128 %conv, fp128* @vf128, align 16
ret void
-; CHECK-LABEL: TestUIToFPU64_F128:
-; CHECK: movq vu64(%rip), %rdi
-; CHECK-NEXT: callq __floatunditf
-; CHECK-NEXT: movaps %xmm0, vf128(%rip)
-; CHECK: retq
+; X32-LABEL: TestUIToFPU64_F128:
+; X32: calll __floatunditf
+; X32: retl
+;
+; X64-LABEL: TestUIToFPU64_F128:
+; X64: movq vu64(%rip), %rdi
+; X64-NEXT: callq __floatunditf
+; X64-NEXT: movaps %xmm0, vf128(%rip)
+; X64: retq
}
define i32 @TestConst128(fp128 %v) {
@@ -176,11 +231,16 @@ entry:
%cmp = fcmp ogt fp128 %v, 0xL00000000000000003FFF000000000000
%conv = zext i1 %cmp to i32
ret i32 %conv
-; CHECK-LABEL: TestConst128:
-; CHECK: movaps {{.*}}, %xmm1
-; CHECK-NEXT: callq __gttf2
-; CHECK-NEXT: test
-; CHECK: retq
+; X32-LABEL: TestConst128:
+; X32: calll __gttf2
+; X32: retl
+;
+; X64-LABEL: TestConst128:
+; X64: movaps {{.*}}, %xmm1
+; X64-NEXT: callq __gttf2
+; X64-NEXT: xorl
+; X64-NEXT: test
+; X64: retq
}
; C code:
@@ -207,17 +267,21 @@ entry:
%cmp = icmp eq i32 %or, 0
%conv = zext i1 %cmp to i32
ret i32 %conv
-; CHECK-LABEL: TestBits128:
-; CHECK: movaps %xmm0, %xmm1
-; CHECK-NEXT: callq __multf3
-; CHECK-NEXT: movaps %xmm0, (%rsp)
-; CHECK-NEXT: movq (%rsp),
-; CHECK-NEXT: movq %
-; CHECK-NEXT: shrq $32,
-; CHECK: orl
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: movzbl %al, %eax
-; CHECK: retq
+; X32-LABEL: TestBits128:
+; X32: calll __multf3
+; X32: retl
+;
+; X64-LABEL: TestBits128:
+; X64: movaps %xmm0, %xmm1
+; X64-NEXT: callq __multf3
+; X64-NEXT: movaps %xmm0, (%rsp)
+; X64-NEXT: movq (%rsp),
+; X64-NEXT: movq %
+; X64-NEXT: shrq $32,
+; X64: xorl %eax, %eax
+; X64-NEXT: orl
+; X64-NEXT: sete %al
+; X64: retq
;
; If TestBits128 fails due to any llvm or clang change,
; please make sure the original simplified C code will
@@ -244,12 +308,19 @@ entry:
%add = add i128 %or, 3
%0 = bitcast i128 %add to fp128
ret fp128 %0
-; CHECK-LABEL: TestPair128:
-; CHECK: addq $3, %rsi
-; CHECK: movq %rsi, -24(%rsp)
-; CHECK: movq %rdi, -16(%rsp)
-; CHECK: movaps -24(%rsp), %xmm0
-; CHECK-NEXT: retq
+; X32-LABEL: TestPair128:
+; X32: addl
+; X32-NEXT: adcl
+; X32-NEXT: adcl
+; X32-NEXT: adcl
+; X32: retl
+;
+; X64-LABEL: TestPair128:
+; X64: addq $3, %rsi
+; X64: movq %rsi, -24(%rsp)
+; X64: movq %rdi, -16(%rsp)
+; X64: movaps -24(%rsp), %xmm0
+; X64-NEXT: retq
}
define fp128 @TestTruncCopysign(fp128 %x, i32 %n) {
@@ -266,12 +337,24 @@ if.then: ; preds = %entry
cleanup: ; preds = %entry, %if.then
%retval.0 = phi fp128 [ %conv1, %if.then ], [ %x, %entry ]
ret fp128 %retval.0
-; CHECK-LABEL: TestTruncCopysign:
-; CHECK: callq __trunctfdf2
-; CHECK-NEXT: andpd {{.*}}, %xmm0
-; CHECK-NEXT: orpd {{.*}}, %xmm0
-; CHECK-NEXT: callq __extenddftf2
-; CHECK: retq
+; X32-LABEL: TestTruncCopysign:
+; X32: calll __trunctfdf2
+; X32: fstpl
+; X32: flds
+; X32: flds
+; X32: fstp
+; X32: fldz
+; X32: fstp
+; X32: fstpl
+; X32: calll __extenddftf2
+; X32: retl
+;
+; X64-LABEL: TestTruncCopysign:
+; X64: callq __trunctfdf2
+; X64-NEXT: andpd {{.*}}, %xmm0
+; X64-NEXT: orpd {{.*}}, %xmm0
+; X64-NEXT: callq __extenddftf2
+; X64: retq
}
declare double @copysign(double, double) #1
diff --git a/test/CodeGen/X86/fp128-compare.ll b/test/CodeGen/X86/fp128-compare.ll
index b5d4fbe1b74e..6ad3b74aeafa 100644
--- a/test/CodeGen/X86/fp128-compare.ll
+++ b/test/CodeGen/X86/fp128-compare.ll
@@ -8,8 +8,9 @@ entry:
ret i32 %conv
; CHECK-LABEL: TestComp128GT:
; CHECK: callq __gttf2
-; CHECK: setg %al
-; CHECK: movzbl %al, %eax
+; CHECK: xorl %ecx, %ecx
+; CHECK: setg %cl
+; CHECK: movl %ecx, %eax
; CHECK: retq
}
@@ -20,9 +21,10 @@ entry:
ret i32 %conv
; CHECK-LABEL: TestComp128GE:
; CHECK: callq __getf2
+; CHECK: xorl %ecx, %ecx
; CHECK: testl %eax, %eax
-; CHECK: setns %al
-; CHECK: movzbl %al, %eax
+; CHECK: setns %cl
+; CHECK: movl %ecx, %eax
; CHECK: retq
}
@@ -48,9 +50,10 @@ entry:
ret i32 %conv
; CHECK-LABEL: TestComp128LE:
; CHECK: callq __letf2
-; CHECK-NEXT: testl %eax, %eax
-; CHECK: setle %al
-; CHECK: movzbl %al, %eax
+; CHECK: xorl %ecx, %ecx
+; CHECK: testl %eax, %eax
+; CHECK: setle %cl
+; CHECK: movl %ecx, %eax
; CHECK: retq
}
@@ -61,9 +64,10 @@ entry:
ret i32 %conv
; CHECK-LABEL: TestComp128EQ:
; CHECK: callq __eqtf2
-; CHECK-NEXT: testl %eax, %eax
-; CHECK: sete %al
-; CHECK: movzbl %al, %eax
+; CHECK: xorl %ecx, %ecx
+; CHECK: testl %eax, %eax
+; CHECK: sete %cl
+; CHECK: movl %ecx, %eax
; CHECK: retq
}
@@ -74,9 +78,10 @@ entry:
ret i32 %conv
; CHECK-LABEL: TestComp128NE:
; CHECK: callq __netf2
-; CHECK-NEXT: testl %eax, %eax
-; CHECK: setne %al
-; CHECK: movzbl %al, %eax
+; CHECK: xorl %ecx, %ecx
+; CHECK: testl %eax, %eax
+; CHECK: setne %cl
+; CHECK: movl %ecx, %eax
; CHECK: retq
}
@@ -86,8 +91,8 @@ entry:
%cond = select i1 %cmp, fp128 %x, fp128 %y
ret fp128 %cond
; CHECK-LABEL: TestMax:
-; CHECK: movaps %xmm1
; CHECK: movaps %xmm0
+; CHECK: movaps %xmm1
; CHECK: callq __gttf2
; CHECK: movaps {{.*}}, %xmm0
; CHECK: testl %eax, %eax
diff --git a/test/CodeGen/X86/fp128-select.ll b/test/CodeGen/X86/fp128-select.ll
new file mode 100644
index 000000000000..dc41d5095a71
--- /dev/null
+++ b/test/CodeGen/X86/fp128-select.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s --check-prefix=MMX
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s --check-prefix=MMX
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu | FileCheck %s
+
+define void @test_select(fp128* %p, fp128* %q, i1 zeroext %c) {
+; MMX-LABEL: test_select:
+; MMX: # BB#0:
+; MMX-NEXT: testb %dl, %dl
+; MMX-NEXT: jne .LBB0_1
+; MMX-NEXT: # BB#2:
+; MMX-NEXT: movaps {{.*}}(%rip), %xmm0
+; MMX-NEXT: movaps %xmm0, (%rsi)
+; MMX-NEXT: retq
+; MMX-NEXT: .LBB0_1:
+; MMX-NEXT: movaps (%rdi), %xmm0
+; MMX-NEXT: movaps %xmm0, (%rsi)
+; MMX-NEXT: retq
+;
+; CHECK-LABEL: test_select:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %dl, %dl
+; CHECK-NEXT: cmovneq (%rdi), %rax
+; CHECK-NEXT: movabsq $9223231299366420480, %rcx # imm = 0x7FFF800000000000
+; CHECK-NEXT: cmovneq 8(%rdi), %rcx
+; CHECK-NEXT: movq %rcx, 8(%rsi)
+; CHECK-NEXT: movq %rax, (%rsi)
+; CHECK-NEXT: retq
+ %a = load fp128, fp128* %p, align 2
+ %r = select i1 %c, fp128 %a, fp128 0xL00000000000000007FFF800000000000
+ store fp128 %r, fp128* %q
+ ret void
+}
diff --git a/test/CodeGen/X86/fpstack-debuginstr-kill.ll b/test/CodeGen/X86/fpstack-debuginstr-kill.ll
index 2ee67dc190bd..874cc7ce7f3f 100644
--- a/test/CodeGen/X86/fpstack-debuginstr-kill.ll
+++ b/test/CodeGen/X86/fpstack-debuginstr-kill.ll
@@ -43,11 +43,10 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!24, !25}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.6.0 (http://llvm.org/git/clang 8444ae7cfeaefae031f8fedf0d1435ca3b14d90b) (http://llvm.org/git/llvm 886f0101a7d176543b831f5efb74c03427244a55)", isOptimized: true, emissionKind: 1, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !21, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.6.0 (http://llvm.org/git/clang 8444ae7cfeaefae031f8fedf0d1435ca3b14d90b) (http://llvm.org/git/llvm 886f0101a7d176543b831f5efb74c03427244a55)", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !21, imports: !2)
!1 = !DIFile(filename: "fpu_ieee.cpp", directory: "x87stackifier")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "fpuop_arithmetic", linkageName: "_Z16fpuop_arithmeticjj", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 13, file: !5, scope: !6, type: !7, variables: !10)
+!4 = distinct !DISubprogram(name: "fpuop_arithmetic", linkageName: "_Z16fpuop_arithmeticjj", line: 11, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 13, file: !5, scope: !6, type: !7, variables: !10)
!5 = !DIFile(filename: "f1.cpp", directory: "x87stackifier")
!6 = !DIFile(filename: "f1.cpp", directory: "x87stackifier")
!7 = !DISubroutineType(types: !8)
@@ -60,7 +59,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
!14 = !DIDerivedType(tag: DW_TAG_typedef, name: "fpu_extended", line: 3, file: !5, baseType: !15)
!15 = !DIDerivedType(tag: DW_TAG_typedef, name: "fpu_register", line: 2, file: !5, baseType: !16)
!16 = !DIDerivedType(tag: DW_TAG_typedef, name: "uae_f64", line: 1, file: !5, baseType: !17)
-!17 = !DIBasicType(tag: DW_TAG_base_type, name: "double", size: 64, align: 64, encoding: DW_ATE_float)
+!17 = !DIBasicType(tag: DW_TAG_base_type, name: "long double", size: 128, align: 128, encoding: DW_ATE_float)
!18 = !DILocalVariable(name: "a", line: 15, scope: !4, file: !6, type: !19)
!19 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!20 = !DILocalVariable(name: "value", line: 16, scope: !4, file: !6, type: !14)
diff --git a/test/CodeGen/X86/frame-order.ll b/test/CodeGen/X86/frame-order.ll
new file mode 100644
index 000000000000..33aaee2951c3
--- /dev/null
+++ b/test/CodeGen/X86/frame-order.ll
@@ -0,0 +1,122 @@
+; RUN: llc -mtriple=x86_64-linux-gnueabi -disable-debug-info-print < %s | FileCheck %s
+; RUN: opt -strip -S < %s | llc -mtriple=x86_64-linux-gnueabi -disable-debug-info-print | FileCheck %s
+
+; This test checks if the code is generated correctly with and without debug info.
+
+; This LL file was generated by running 'clang -g -gcodeview' on the
+; following code:
+; 1: extern "C" volatile int x;
+; 2: extern "C" void capture(int *p);
+; 3: static __forceinline inline void will_be_inlined() {
+; 4: int v = 3;
+; 5: capture(&v);
+; 6: }
+; 7: extern "C" void f(int param) {
+; 8: if (param) {
+; 9: int a = 42;
+; 10: will_be_inlined();
+; 11: capture(&a);
+; 12: } else {
+; 13: int b = 42;
+; 14: will_be_inlined();
+; 15: capture(&b);
+; 16: }
+; 17: }
+
+; ModuleID = 't.cpp'
+
+; Function Attrs: nounwind uwtable
+define void @f(i32 %param) #0 !dbg !4 {
+entry:
+ %v.i1 = alloca i32, align 4
+ call void @llvm.dbg.declare(metadata i32* %v.i1, metadata !15, metadata !16), !dbg !17
+ %v.i = alloca i32, align 4
+ call void @llvm.dbg.declare(metadata i32* %v.i, metadata !15, metadata !16), !dbg !21
+ %param.addr = alloca i32, align 4
+ %a = alloca i32, align 4
+ %b = alloca i32, align 4
+ store i32 %param, i32* %param.addr, align 4
+ call void @llvm.dbg.declare(metadata i32* %param.addr, metadata !24, metadata !16), !dbg !25
+ %0 = load i32, i32* %param.addr, align 4, !dbg !26
+ %tobool = icmp ne i32 %0, 0, !dbg !26
+ br i1 %tobool, label %if.then, label %if.else, !dbg !27
+
+;CHECK: movl [[REG:.*]], 20(%rsp)
+;CHECK: je [[LABEL:.*]]
+
+if.then: ; preds = %entry
+ call void @llvm.dbg.declare(metadata i32* %a, metadata !28, metadata !16), !dbg !29
+ store i32 42, i32* %a, align 4, !dbg !29
+ store i32 3, i32* %v.i, align 4, !dbg !21
+ call void @capture(i32* %v.i) #3, !dbg !30
+ call void @capture(i32* %a), !dbg !31
+ br label %if.end, !dbg !32
+
+;CHECK: movl $3, 12(%rsp)
+
+if.else: ; preds = %entry
+ call void @llvm.dbg.declare(metadata i32* %b, metadata !33, metadata !16), !dbg !34
+ store i32 42, i32* %b, align 4, !dbg !34
+ store i32 3, i32* %v.i1, align 4, !dbg !17
+ call void @capture(i32* %v.i1) #3, !dbg !35
+ call void @capture(i32* %b), !dbg !36
+ br label %if.end
+
+;CHECK: [[LABEL]]:
+;CHECK: movl $3, 16(%rsp)
+
+if.end: ; preds = %if.else, %if.then
+ ret void, !dbg !37
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare void @capture(i32*) #2
+
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!11, !12, !13}
+!llvm.ident = !{!14}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.9.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "t.cpp", directory: "D:\5Csrc\5Cllvm\5Cbuild")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 7, type: !5, isLocal: false, isDefinition: true, scopeLine: 7, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!5 = !DISubroutineType(types: !6)
+!6 = !{null, !7}
+!7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!8 = distinct !DISubprogram(name: "will_be_inlined", linkageName: "\01?will_be_inlined@@YAXXZ", scope: !1, file: !1, line: 3, type: !9, isLocal: true, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!9 = !DISubroutineType(types: !10)
+!10 = !{null}
+!11 = !{i32 2, !"CodeView", i32 1}
+!12 = !{i32 2, !"Debug Info Version", i32 3}
+!13 = !{i32 1, !"PIC Level", i32 2}
+!14 = !{!"clang version 3.9.0 "}
+!15 = !DILocalVariable(name: "v", scope: !8, file: !1, line: 4, type: !7)
+!16 = !DIExpression()
+!17 = !DILocation(line: 4, column: 7, scope: !8, inlinedAt: !18)
+!18 = distinct !DILocation(line: 14, column: 5, scope: !19)
+!19 = distinct !DILexicalBlock(scope: !20, file: !1, line: 12, column: 10)
+!20 = distinct !DILexicalBlock(scope: !4, file: !1, line: 8, column: 7)
+!21 = !DILocation(line: 4, column: 7, scope: !8, inlinedAt: !22)
+!22 = distinct !DILocation(line: 10, column: 5, scope: !23)
+!23 = distinct !DILexicalBlock(scope: !20, file: !1, line: 8, column: 14)
+!24 = !DILocalVariable(name: "param", arg: 1, scope: !4, file: !1, line: 7, type: !7)
+!25 = !DILocation(line: 7, column: 23, scope: !4)
+!26 = !DILocation(line: 8, column: 7, scope: !20)
+!27 = !DILocation(line: 8, column: 7, scope: !4)
+!28 = !DILocalVariable(name: "a", scope: !23, file: !1, line: 9, type: !7)
+!29 = !DILocation(line: 9, column: 9, scope: !23)
+!30 = !DILocation(line: 5, column: 3, scope: !8, inlinedAt: !22)
+!31 = !DILocation(line: 11, column: 5, scope: !23)
+!32 = !DILocation(line: 12, column: 3, scope: !23)
+!33 = !DILocalVariable(name: "b", scope: !19, file: !1, line: 13, type: !7)
+!34 = !DILocation(line: 13, column: 9, scope: !19)
+!35 = !DILocation(line: 5, column: 3, scope: !8, inlinedAt: !18)
+!36 = !DILocation(line: 15, column: 5, scope: !19)
+!37 = !DILocation(line: 17, column: 1, scope: !4)
diff --git a/test/CodeGen/X86/ga-offset.ll b/test/CodeGen/X86/ga-offset.ll
index 934c14921e99..3613cf8bf598 100644
--- a/test/CodeGen/X86/ga-offset.ll
+++ b/test/CodeGen/X86/ga-offset.ll
@@ -1,18 +1,11 @@
-; RUN: llc < %s -march=x86 > %t
-; RUN: not grep lea %t
-; RUN: not grep add %t
-; RUN: grep mov %t | count 1
-; RUN: llc < %s -mtriple=x86_64-linux -relocation-model=static > %t
-; RUN: not grep lea %t
-; RUN: not grep add %t
-; RUN: grep mov %t | count 1
-
-; This store should fold to a single mov instruction.
+; RUN: llc < %s -mtriple=x86_64-linux -relocation-model=static | FileCheck %s
@ptr = global i32* null
@dst = global [131072 x i32] zeroinitializer
define void @foo() nounwind {
+; This store should fold to a single mov instruction.
+; CHECK: movq $dst+64, ptr(%rip)
store i32* getelementptr ([131072 x i32], [131072 x i32]* @dst, i32 0, i32 16), i32** @ptr
ret void
}
diff --git a/test/CodeGen/X86/ga-offset2.ll b/test/CodeGen/X86/ga-offset2.ll
new file mode 100644
index 000000000000..bc4a3493ff6f
--- /dev/null
+++ b/test/CodeGen/X86/ga-offset2.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -mtriple=i686-apple-darwin -relocation-model=dynamic-no-pic | FileCheck %s
+
+@var = external hidden global i32
+@p = external hidden global i32*
+
+define void @f() {
+; CHECK: movl $_var+40, _p
+ store i32* getelementptr (i32, i32* @var, i64 10), i32** @p
+ ret void
+}
diff --git a/test/CodeGen/X86/global-access-pie.ll b/test/CodeGen/X86/global-access-pie.ll
new file mode 100644
index 000000000000..0e29d605476d
--- /dev/null
+++ b/test/CodeGen/X86/global-access-pie.ll
@@ -0,0 +1,123 @@
+; RUN: llc < %s -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic \
+; RUN: | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -emulated-tls -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic \
+; RUN: | FileCheck -check-prefix=X32 %s
+
+; External Linkage
+@a = global i32 0, align 4
+
+define i32 @my_access_global_a() #0 {
+; X32-LABEL: my_access_global_a:
+; X32: addl $_GLOBAL_OFFSET_TABLE_{{.*}}, %eax
+; X32-NEXT: movl a@GOTOFF(%eax), %eax
+; X64-LABEL: my_access_global_a:
+; X64: movl a(%rip), %eax
+
+entry:
+ %0 = load i32, i32* @a, align 4
+ ret i32 %0
+}
+
+; WeakAny Linkage
+@b = weak global i32 0, align 4
+
+define i32 @my_access_global_b() #0 {
+; X32-LABEL: my_access_global_b:
+; X32: addl $_GLOBAL_OFFSET_TABLE_{{.*}}, %eax
+; X32-NEXT: movl b@GOTOFF(%eax), %eax
+; X64-LABEL: my_access_global_b:
+; X64: movl b(%rip), %eax
+
+entry:
+ %0 = load i32, i32* @b, align 4
+ ret i32 %0
+}
+
+; Internal Linkage
+@c = internal global i32 0, align 4
+
+define i32 @my_access_global_c() #0 {
+; X32-LABEL: my_access_global_c:
+; X32: addl $_GLOBAL_OFFSET_TABLE_{{.*}}, %eax
+; X32-NEXT: movl c@GOTOFF(%eax), %eax
+; X64-LABEL: my_access_global_c:
+; X64: movl c(%rip), %eax
+
+entry:
+ %0 = load i32, i32* @c, align 4
+ ret i32 %0
+}
+
+; External Linkage, only declaration.
+@d = external global i32, align 4
+
+define i32 @my_access_global_load_d() #0 {
+; X32-LABEL: my_access_global_load_d:
+; X32: addl $_GLOBAL_OFFSET_TABLE_{{.*}}, %eax
+; X32-NEXT: movl d@GOT(%eax), %eax
+; X32-NEXT: movl (%eax), %eax
+; X64-LABEL: my_access_global_load_d:
+; X64: movq d@GOTPCREL(%rip), %rax
+; X64-NEXT: movl (%rax), %eax
+
+entry:
+ %0 = load i32, i32* @d, align 4
+ ret i32 %0
+}
+
+; External Linkage, only declaration, store a value.
+
+define i32 @my_access_global_store_d() #0 {
+; X32-LABEL: my_access_global_store_d:
+; X32: addl $_GLOBAL_OFFSET_TABLE_{{.*}}, %eax
+; X32-NEXT: movl d@GOT(%eax), %eax
+; X32-NEXT: movl $2, (%eax)
+; X64-LABEL: my_access_global_store_d:
+; X64: movq d@GOTPCREL(%rip), %rax
+; X64-NEXT: movl $2, (%rax)
+
+entry:
+ store i32 2, i32* @d, align 4
+ ret i32 0
+}
+
+; External Linkage, function pointer access.
+declare i32 @access_fp(i32 ()*)
+declare i32 @foo()
+
+define i32 @my_access_fp_foo() #0 {
+; X32-LABEL: my_access_fp_foo:
+; X32: addl $_GLOBAL_OFFSET_TABLE_{{.*}}, %ebx
+; X32-NEXT: movl foo@GOT(%ebx), %eax
+; X64-LABEL: my_access_fp_foo:
+; X64: movq foo@GOTPCREL(%rip), %rdi
+
+entry:
+ %call = call i32 @access_fp(i32 ()* @foo)
+ ret i32 %call
+}
+
+; LinkOnceODR Linkage, function pointer access.
+
+$bar = comdat any
+
+define linkonce_odr i32 @bar() comdat {
+entry:
+ ret i32 0
+}
+
+define i32 @my_access_fp_bar() #0 {
+; X32-LABEL: my_access_fp_bar:
+; X32: addl $_GLOBAL_OFFSET_TABLE_{{.*}}, %ebx
+; X32-NEXT: leal bar@GOTOFF(%ebx), %eax
+; X64-LABEL: my_access_fp_bar:
+; X64: leaq bar(%rip), %rdi
+
+entry:
+ %call = call i32 @access_fp(i32 ()* @bar)
+ ret i32 %call
+}
+
+!llvm.module.flags = !{!0, !1}
+!0 = !{i32 1, !"PIC Level", i32 1}
+!1 = !{i32 1, !"PIE Level", i32 1}
diff --git a/test/CodeGen/X86/global-sections.ll b/test/CodeGen/X86/global-sections.ll
index 92440f2b3316..ea6df468ceb2 100644
--- a/test/CodeGen/X86/global-sections.ll
+++ b/test/CodeGen/X86/global-sections.ll
@@ -93,11 +93,13 @@ bb7:
; DARWIN64: Lfunc_end
; DARWIN64-NEXT: .cfi_endproc
; DARWIN64-NOT: .section
+; DARWIN64: .data_region jt32
; DARWIN64: LJTI{{.*}}:
; DARWIN64-NEXT: .long
; DARWIN64-NEXT: .long
; DARWIN64-NEXT: .long
; DARWIN64-NEXT: .long
+; DARWIN64-NEXT: .end_data_region
; DARWIN64-NEXT: .section __TEXT,__gcc_except_tab
; int G1;
@@ -241,13 +243,13 @@ bb7:
; DARWIN: .section __DATA,__data{{$}}
; DARWIN: .globl _G10
; DARWIN: .weak_definition _G10
-; DARWIN: .align 5
+; DARWIN: .p2align 5
; DARWIN: _G10:
; DARWIN: .space 400
; LINUX: .bss
; LINUX: .weak G10
-; LINUX: .align 32
+; LINUX: .p2align 5
; LINUX: G10:
; LINUX: .zero 400
@@ -298,3 +300,32 @@ bb7:
; WIN32-SECTIONS: .section .rdata,"dr",one_only,_G15
; WIN32-SECTIONS: _G15:
+
+@G16 = unnamed_addr constant i256 0
+
+; LINUX: .section .rodata.cst32,"aM",@progbits,32
+; LINUX: G16:
+
+; LINUX-SECTIONS: .section .rodata.cst32,"aM",@progbits,32
+; LINUX-SECTIONS: G16:
+
+; WIN32-SECTIONS: .section .rdata,"dr",one_only,_G16
+; WIN32-SECTIONS: _G16:
+
+; PR26570
+
+@G17 = internal global i8 0
+; LINUX: .type G17,@object
+; LINUX: .local G17
+; LINUX: .comm G17,1,1
+
+; DARWIN: .zerofill __DATA,__bss,_G17,1,0
+
+; LINUX-SECTIONS: .type G17,@object
+; LINUX-SECTIONS: .section .bss.G17,"aw",@nobits
+; LINUX-SECTIONS: .byte 0
+; LINUX-SECTIONS: .size G17, 1
+
+; WIN32-SECTIONS: .section .bss,"bw",one_only,_G17
+; WIN32-SECTIONS: _G17:
+; WIN32-SECTIONS: .byte 0
diff --git a/test/CodeGen/X86/h-registers-3.ll b/test/CodeGen/X86/h-registers-3.ll
index 58b02b7df21f..819f21625abf 100644
--- a/test/CodeGen/X86/h-registers-3.ll
+++ b/test/CodeGen/X86/h-registers-3.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 | grep mov | count 1
-; RUN: llc < %s -march=x86-64 | grep mov | count 1
-; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | grep mov | count 1
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s -check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s -check-prefix=X32
define zeroext i8 @foo() nounwind ssp {
entry:
@@ -8,6 +8,28 @@ entry:
%1 = lshr i16 %0, 8
%2 = trunc i16 %1 to i8
ret i8 %2
+
+; X86-LABEL: foo
+; X86: calll
+; X86-NEXT: movb %ah, %al
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+
+; X64-LABEL: foo
+; X64: callq
+; X64-NEXT: # kill
+; X64-NEXT: shrl $8, %eax
+; X64-NEXT: # kill
+; X64-NEXT: popq
+; X64-NEXT: retq
+
+; X32-LABEL: foo
+; X32: callq
+; X32-NEXT: # kill
+; X32-NEXT: shrl $8, %eax
+; X32-NEXT: # kill
+; X32-NEXT: popq
+; X32-NEXT: retq
}
declare zeroext i16 @bar(...)
diff --git a/test/CodeGen/X86/haddsub-2.ll b/test/CodeGen/X86/haddsub-2.ll
index c6bac5858807..517a663bc815 100644
--- a/test/CodeGen/X86/haddsub-2.ll
+++ b/test/CodeGen/X86/haddsub-2.ll
@@ -1,11 +1,19 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse3 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE3
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse3,+ssse3 | FileCheck %s -check-prefix=CHECK -check-prefix=SSSE3
-; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
-; RUN: llc < %s -march=x86-64 -mattr=+avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
-
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3,+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
+; SSE-LABEL: hadd_ps_test1:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: hadd_ps_test1:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 0
%vecext1 = extractelement <4 x float> %A, i32 1
%add = fadd float %vecext, %vecext1
@@ -24,12 +32,17 @@ define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
%vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
ret <4 x float> %vecinit13
}
-; CHECK-LABEL: hadd_ps_test1
-; CHECK: haddps
-; CHECK-NEXT: ret
-
define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
+; SSE-LABEL: hadd_ps_test2:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: hadd_ps_test2:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 2
%vecext1 = extractelement <4 x float> %A, i32 3
%add = fadd float %vecext, %vecext1
@@ -48,12 +61,17 @@ define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
%vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 2
ret <4 x float> %vecinit13
}
-; CHECK-LABEL: hadd_ps_test2
-; CHECK: haddps
-; CHECK-NEXT: ret
-
define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
+; SSE-LABEL: hsub_ps_test1:
+; SSE: # BB#0:
+; SSE-NEXT: hsubps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: hsub_ps_test1:
+; AVX: # BB#0:
+; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 0
%vecext1 = extractelement <4 x float> %A, i32 1
%sub = fsub float %vecext, %vecext1
@@ -72,12 +90,17 @@ define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
%vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 3
ret <4 x float> %vecinit13
}
-; CHECK-LABEL: hsub_ps_test1
-; CHECK: hsubps
-; CHECK-NEXT: ret
-
define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
+; SSE-LABEL: hsub_ps_test2:
+; SSE: # BB#0:
+; SSE-NEXT: hsubps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: hsub_ps_test2:
+; AVX: # BB#0:
+; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 2
%vecext1 = extractelement <4 x float> %A, i32 3
%sub = fsub float %vecext, %vecext1
@@ -96,12 +119,46 @@ define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
%vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
ret <4 x float> %vecinit13
}
-; CHECK-LABEL: hsub_ps_test2
-; CHECK: hsubps
-; CHECK-NEXT: ret
-
define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
+; SSE3-LABEL: phadd_d_test1:
+; SSE3: # BB#0:
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSE3-NEXT: movd %xmm2, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: movd %xmm2, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edx
+; SSE3-NEXT: addl %eax, %edx
+; SSE3-NEXT: movd %xmm1, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: addl %eax, %esi
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edi
+; SSE3-NEXT: addl %eax, %edi
+; SSE3-NEXT: movd %edi, %xmm0
+; SSE3-NEXT: movd %edx, %xmm1
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE3-NEXT: movd %esi, %xmm2
+; SSE3-NEXT: movd %ecx, %xmm0
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: phadd_d_test1:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: phadd_d_test1:
+; AVX: # BB#0:
+; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x i32> %A, i32 0
%vecext1 = extractelement <4 x i32> %A, i32 1
%add = add i32 %vecext, %vecext1
@@ -120,15 +177,46 @@ define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
%vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 3
ret <4 x i32> %vecinit13
}
-; CHECK-LABEL: phadd_d_test1
-; SSE3-NOT: phaddd
-; SSSE3: phaddd
-; AVX: vphaddd
-; AVX2 vphaddd
-; CHECK: ret
-
define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
+; SSE3-LABEL: phadd_d_test2:
+; SSE3: # BB#0:
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: movd %xmm2, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; SSE3-NEXT: movd %xmm2, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edx
+; SSE3-NEXT: addl %eax, %edx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: addl %eax, %esi
+; SSE3-NEXT: movd %esi, %xmm0
+; SSE3-NEXT: movd %ecx, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: movd %xmm1, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: movd %ecx, %xmm1
+; SSE3-NEXT: movd %edx, %xmm0
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: phadd_d_test2:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: phadd_d_test2:
+; AVX: # BB#0:
+; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x i32> %A, i32 2
%vecext1 = extractelement <4 x i32> %A, i32 3
%add = add i32 %vecext, %vecext1
@@ -147,15 +235,46 @@ define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
%vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 2
ret <4 x i32> %vecinit13
}
-; CHECK-LABEL: phadd_d_test2
-; SSE3-NOT: phaddd
-; SSSE3: phaddd
-; AVX: vphaddd
-; AVX2 vphaddd
-; CHECK: ret
-
define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
+; SSE3-LABEL: phsub_d_test1:
+; SSE3: # BB#0:
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSE3-NEXT: movd %xmm2, %ecx
+; SSE3-NEXT: subl %ecx, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: movd %xmm2, %ecx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edx
+; SSE3-NEXT: subl %edx, %ecx
+; SSE3-NEXT: movd %xmm1, %edx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: subl %esi, %edx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edi
+; SSE3-NEXT: subl %edi, %esi
+; SSE3-NEXT: movd %esi, %xmm0
+; SSE3-NEXT: movd %ecx, %xmm1
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE3-NEXT: movd %edx, %xmm2
+; SSE3-NEXT: movd %eax, %xmm0
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: phsub_d_test1:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phsubd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: phsub_d_test1:
+; AVX: # BB#0:
+; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x i32> %A, i32 0
%vecext1 = extractelement <4 x i32> %A, i32 1
%sub = sub i32 %vecext, %vecext1
@@ -174,15 +293,46 @@ define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
%vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
ret <4 x i32> %vecinit13
}
-; CHECK-LABEL: phsub_d_test1
-; SSE3-NOT: phsubd
-; SSSE3: phsubd
-; AVX: vphsubd
-; AVX2 vphsubd
-; CHECK: ret
-
define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
+; SSE3-LABEL: phsub_d_test2:
+; SSE3: # BB#0:
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE3-NEXT: movd %xmm2, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; SSE3-NEXT: movd %xmm2, %ecx
+; SSE3-NEXT: subl %ecx, %eax
+; SSE3-NEXT: movd %xmm0, %ecx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edx
+; SSE3-NEXT: subl %edx, %ecx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %edx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: subl %esi, %edx
+; SSE3-NEXT: movd %edx, %xmm0
+; SSE3-NEXT: movd %eax, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE3-NEXT: movd %xmm1, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edx
+; SSE3-NEXT: subl %edx, %eax
+; SSE3-NEXT: movd %eax, %xmm1
+; SSE3-NEXT: movd %ecx, %xmm0
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: phsub_d_test2:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phsubd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: phsub_d_test2:
+; AVX: # BB#0:
+; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x i32> %A, i32 2
%vecext1 = extractelement <4 x i32> %A, i32 3
%sub = sub i32 %vecext, %vecext1
@@ -201,15 +351,17 @@ define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
%vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 2
ret <4 x i32> %vecinit13
}
-; CHECK-LABEL: phsub_d_test2
-; SSE3-NOT: phsubd
-; SSSE3: phsubd
-; AVX: vphsubd
-; AVX2 vphsubd
-; CHECK: ret
-
define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
+; SSE-LABEL: hadd_pd_test1:
+; SSE: # BB#0:
+; SSE-NEXT: haddpd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: hadd_pd_test1:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <2 x double> %A, i32 0
%vecext1 = extractelement <2 x double> %A, i32 1
%add = fadd double %vecext, %vecext1
@@ -220,12 +372,17 @@ define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
%vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
ret <2 x double> %vecinit2
}
-; CHECK-LABEL: hadd_pd_test1
-; CHECK: haddpd
-; CHECK-NEXT: ret
-
define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
+; SSE-LABEL: hadd_pd_test2:
+; SSE: # BB#0:
+; SSE-NEXT: haddpd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: hadd_pd_test2:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <2 x double> %A, i32 1
%vecext1 = extractelement <2 x double> %A, i32 0
%add = fadd double %vecext, %vecext1
@@ -236,12 +393,17 @@ define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
%vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
ret <2 x double> %vecinit2
}
-; CHECK-LABEL: hadd_pd_test2
-; CHECK: haddpd
-; CHECK-NEXT: ret
-
define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
+; SSE-LABEL: hsub_pd_test1:
+; SSE: # BB#0:
+; SSE-NEXT: hsubpd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: hsub_pd_test1:
+; AVX: # BB#0:
+; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <2 x double> %A, i32 0
%vecext1 = extractelement <2 x double> %A, i32 1
%sub = fsub double %vecext, %vecext1
@@ -252,12 +414,17 @@ define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
%vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 1
ret <2 x double> %vecinit2
}
-; CHECK-LABEL: hsub_pd_test1
-; CHECK: hsubpd
-; CHECK-NEXT: ret
-
define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
+; SSE-LABEL: hsub_pd_test2:
+; SSE: # BB#0:
+; SSE-NEXT: hsubpd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: hsub_pd_test2:
+; AVX: # BB#0:
+; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <2 x double> %B, i32 0
%vecext1 = extractelement <2 x double> %B, i32 1
%sub = fsub double %vecext, %vecext1
@@ -268,12 +435,23 @@ define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
%vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
ret <2 x double> %vecinit2
}
-; CHECK-LABEL: hsub_pd_test2
-; CHECK: hsubpd
-; CHECK-NEXT: ret
-
define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
+; SSE-LABEL: avx_vhadd_pd_test:
+; SSE: # BB#0:
+; SSE-NEXT: haddpd %xmm1, %xmm0
+; SSE-NEXT: haddpd %xmm3, %xmm2
+; SSE-NEXT: movapd %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: avx_vhadd_pd_test:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vhaddpd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vhaddpd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x double> %A, i32 0
%vecext1 = extractelement <4 x double> %A, i32 1
%add = fadd double %vecext, %vecext1
@@ -292,19 +470,23 @@ define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
%vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
ret <4 x double> %vecinit13
}
-; CHECK-LABEL: avx_vhadd_pd_test
-; SSE3: haddpd
-; SSE3-NEXT: haddpd
-; SSSE3: haddpd
-; SSSE3: haddpd
-; AVX: vhaddpd
-; AVX: vhaddpd
-; AVX2: vhaddpd
-; AVX2: vhaddpd
-; CHECK: ret
-
define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
+; SSE-LABEL: avx_vhsub_pd_test:
+; SSE: # BB#0:
+; SSE-NEXT: hsubpd %xmm1, %xmm0
+; SSE-NEXT: hsubpd %xmm3, %xmm2
+; SSE-NEXT: movapd %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: avx_vhsub_pd_test:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vhsubpd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vhsubpd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x double> %A, i32 0
%vecext1 = extractelement <4 x double> %A, i32 1
%sub = fsub double %vecext, %vecext1
@@ -323,19 +505,86 @@ define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
%vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
ret <4 x double> %vecinit13
}
-; CHECK-LABEL: avx_vhsub_pd_test
-; SSE3: hsubpd
-; SSE3-NEXT: hsubpd
-; SSSE3: hsubpd
-; SSSE3-NEXT: hsubpd
-; AVX: vhsubpd
-; AVX: vhsubpd
-; AVX2: vhsubpd
-; AVX2: vhsubpd
-; CHECK: ret
-
define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
+; SSE3-LABEL: avx2_vphadd_d_test:
+; SSE3: # BB#0:
+; SSE3-NEXT: movd %xmm0, %ecx
+; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
+; SSE3-NEXT: movd %xmm4, %r8d
+; SSE3-NEXT: addl %ecx, %r8d
+; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE3-NEXT: movd %xmm4, %edx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %r9d
+; SSE3-NEXT: addl %edx, %r9d
+; SSE3-NEXT: movd %xmm1, %esi
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %r10d
+; SSE3-NEXT: addl %esi, %r10d
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edi
+; SSE3-NEXT: addl %esi, %edi
+; SSE3-NEXT: movd %xmm2, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %r11d
+; SSE3-NEXT: addl %eax, %r11d
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: movd %xmm3, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edx
+; SSE3-NEXT: addl %eax, %edx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: addl %eax, %esi
+; SSE3-NEXT: movd %edi, %xmm0
+; SSE3-NEXT: movd %r9d, %xmm1
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE3-NEXT: movd %r10d, %xmm2
+; SSE3-NEXT: movd %r8d, %xmm0
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: movd %esi, %xmm1
+; SSE3-NEXT: movd %ecx, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE3-NEXT: movd %edx, %xmm3
+; SSE3-NEXT: movd %r11d, %xmm1
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: avx2_vphadd_d_test:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm1, %xmm0
+; SSSE3-NEXT: phaddd %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: avx2_vphadd_d_test:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: avx2_vphadd_d_test:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vphaddd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vphaddd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
%vecext = extractelement <8 x i32> %A, i32 0
%vecext1 = extractelement <8 x i32> %A, i32 1
%add = add i32 %vecext, %vecext1
@@ -370,17 +619,154 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
%vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
ret <8 x i32> %vecinit29
}
-; CHECK-LABEL: avx2_vphadd_d_test
-; SSE3-NOT: phaddd
-; SSSE3: phaddd
-; SSSE3-NEXT: phaddd
-; AVX: vphaddd
-; AVX: vphaddd
-; AVX2: vphaddd
-; AVX2: vphaddd
-; CHECK: ret
define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
+; SSE3-LABEL: avx2_vphadd_w_test:
+; SSE3: # BB#0:
+; SSE3-NEXT: pushq %rbp
+; SSE3-NEXT: .Ltmp0:
+; SSE3-NEXT: .cfi_def_cfa_offset 16
+; SSE3-NEXT: pushq %r15
+; SSE3-NEXT: .Ltmp1:
+; SSE3-NEXT: .cfi_def_cfa_offset 24
+; SSE3-NEXT: pushq %r14
+; SSE3-NEXT: .Ltmp2:
+; SSE3-NEXT: .cfi_def_cfa_offset 32
+; SSE3-NEXT: pushq %r13
+; SSE3-NEXT: .Ltmp3:
+; SSE3-NEXT: .cfi_def_cfa_offset 40
+; SSE3-NEXT: pushq %r12
+; SSE3-NEXT: .Ltmp4:
+; SSE3-NEXT: .cfi_def_cfa_offset 48
+; SSE3-NEXT: pushq %rbx
+; SSE3-NEXT: .Ltmp5:
+; SSE3-NEXT: .cfi_def_cfa_offset 56
+; SSE3-NEXT: .Ltmp6:
+; SSE3-NEXT: .cfi_offset %rbx, -56
+; SSE3-NEXT: .Ltmp7:
+; SSE3-NEXT: .cfi_offset %r12, -48
+; SSE3-NEXT: .Ltmp8:
+; SSE3-NEXT: .cfi_offset %r13, -40
+; SSE3-NEXT: .Ltmp9:
+; SSE3-NEXT: .cfi_offset %r14, -32
+; SSE3-NEXT: .Ltmp10:
+; SSE3-NEXT: .cfi_offset %r15, -24
+; SSE3-NEXT: .Ltmp11:
+; SSE3-NEXT: .cfi_offset %rbp, -16
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pextrw $1, %xmm0, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE3-NEXT: pextrw $2, %xmm0, %eax
+; SSE3-NEXT: pextrw $3, %xmm0, %r11d
+; SSE3-NEXT: addl %eax, %r11d
+; SSE3-NEXT: pextrw $4, %xmm0, %eax
+; SSE3-NEXT: pextrw $5, %xmm0, %r10d
+; SSE3-NEXT: addl %eax, %r10d
+; SSE3-NEXT: pextrw $6, %xmm0, %eax
+; SSE3-NEXT: pextrw $7, %xmm0, %r13d
+; SSE3-NEXT: addl %eax, %r13d
+; SSE3-NEXT: movd %xmm1, %eax
+; SSE3-NEXT: pextrw $1, %xmm1, %r14d
+; SSE3-NEXT: addl %eax, %r14d
+; SSE3-NEXT: pextrw $2, %xmm1, %eax
+; SSE3-NEXT: pextrw $3, %xmm1, %ebp
+; SSE3-NEXT: addl %eax, %ebp
+; SSE3-NEXT: pextrw $4, %xmm1, %eax
+; SSE3-NEXT: pextrw $5, %xmm1, %ebx
+; SSE3-NEXT: addl %eax, %ebx
+; SSE3-NEXT: pextrw $6, %xmm1, %eax
+; SSE3-NEXT: pextrw $7, %xmm1, %edx
+; SSE3-NEXT: addl %eax, %edx
+; SSE3-NEXT: movd %xmm2, %eax
+; SSE3-NEXT: pextrw $1, %xmm2, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE3-NEXT: pextrw $2, %xmm2, %eax
+; SSE3-NEXT: pextrw $3, %xmm2, %r12d
+; SSE3-NEXT: addl %eax, %r12d
+; SSE3-NEXT: pextrw $4, %xmm2, %eax
+; SSE3-NEXT: pextrw $5, %xmm2, %r15d
+; SSE3-NEXT: addl %eax, %r15d
+; SSE3-NEXT: pextrw $6, %xmm2, %eax
+; SSE3-NEXT: pextrw $7, %xmm2, %r8d
+; SSE3-NEXT: addl %eax, %r8d
+; SSE3-NEXT: movd %xmm3, %eax
+; SSE3-NEXT: pextrw $1, %xmm3, %r9d
+; SSE3-NEXT: addl %eax, %r9d
+; SSE3-NEXT: pextrw $2, %xmm3, %eax
+; SSE3-NEXT: pextrw $3, %xmm3, %esi
+; SSE3-NEXT: addl %eax, %esi
+; SSE3-NEXT: pextrw $4, %xmm3, %eax
+; SSE3-NEXT: pextrw $5, %xmm3, %edi
+; SSE3-NEXT: addl %eax, %edi
+; SSE3-NEXT: pextrw $6, %xmm3, %ecx
+; SSE3-NEXT: pextrw $7, %xmm3, %eax
+; SSE3-NEXT: addl %ecx, %eax
+; SSE3-NEXT: movd %edx, %xmm8
+; SSE3-NEXT: movd %r13d, %xmm3
+; SSE3-NEXT: movd %ebp, %xmm9
+; SSE3-NEXT: movd %r11d, %xmm4
+; SSE3-NEXT: movd %ebx, %xmm10
+; SSE3-NEXT: movd %r10d, %xmm7
+; SSE3-NEXT: movd %r14d, %xmm11
+; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
+; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE3-NEXT: movd %eax, %xmm12
+; SSE3-NEXT: movd %r8d, %xmm6
+; SSE3-NEXT: movd %esi, %xmm13
+; SSE3-NEXT: movd %r12d, %xmm5
+; SSE3-NEXT: movd %edi, %xmm14
+; SSE3-NEXT: movd %r15d, %xmm2
+; SSE3-NEXT: movd %r9d, %xmm15
+; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload
+; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE3-NEXT: popq %rbx
+; SSE3-NEXT: popq %r12
+; SSE3-NEXT: popq %r13
+; SSE3-NEXT: popq %r14
+; SSE3-NEXT: popq %r15
+; SSE3-NEXT: popq %rbp
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: avx2_vphadd_w_test:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddw %xmm1, %xmm0
+; SSSE3-NEXT: phaddw %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: avx2_vphadd_w_test:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vphaddw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vphaddw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: avx2_vphadd_w_test:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vphaddw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vphaddw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
%vecext = extractelement <16 x i16> %a, i32 0
%vecext1 = extractelement <16 x i16> %a, i32 1
%add = add i16 %vecext, %vecext1
@@ -447,20 +833,58 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
%vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
ret <16 x i16> %vecinit108
}
-; CHECK-LABEL: avx2_vphadd_w_test
-; SSE3-NOT: phaddw
-; SSSE3: phaddw
-; SSSE3-NEXT: phaddw
-; AVX: vphaddw
-; AVX: vphaddw
-; AVX2: vphaddw
-; AVX2: vphaddw
-; CHECK: ret
-
; Verify that we don't select horizontal subs in the following functions.
define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: not_a_hsub_1:
+; SSE: # BB#0:
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSE-NEXT: movd %xmm2, %ecx
+; SSE-NEXT: subl %ecx, %eax
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE-NEXT: movd %xmm2, %ecx
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE-NEXT: movd %xmm0, %edx
+; SSE-NEXT: subl %edx, %ecx
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT: movd %xmm0, %edx
+; SSE-NEXT: movd %xmm1, %esi
+; SSE-NEXT: subl %esi, %edx
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE-NEXT: movd %xmm0, %esi
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: movd %xmm0, %edi
+; SSE-NEXT: subl %edi, %esi
+; SSE-NEXT: movd %esi, %xmm0
+; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: movd %edx, %xmm2
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: not_a_hsub_1:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpextrd $1, %xmm0, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpextrd $2, %xmm0, %ecx
+; AVX-NEXT: vpextrd $3, %xmm0, %edx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: vpextrd $1, %xmm1, %edx
+; AVX-NEXT: vmovd %xmm1, %esi
+; AVX-NEXT: subl %esi, %edx
+; AVX-NEXT: vpextrd $3, %xmm1, %esi
+; AVX-NEXT: vpextrd $2, %xmm1, %edi
+; AVX-NEXT: subl %edi, %esi
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x i32> %A, i32 0
%vecext1 = extractelement <4 x i32> %A, i32 1
%sub = sub i32 %vecext, %vecext1
@@ -479,12 +903,45 @@ define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
%vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
ret <4 x i32> %vecinit13
}
-; CHECK-LABEL: not_a_hsub_1
-; CHECK-NOT: phsubd
-; CHECK: ret
-
define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
+; SSE-LABEL: not_a_hsub_2:
+; SSE: # BB#0:
+; SSE-NEXT: movapd %xmm0, %xmm2
+; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
+; SSE-NEXT: movapd %xmm0, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; SSE-NEXT: subss %xmm3, %xmm2
+; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE-NEXT: subss %xmm3, %xmm0
+; SSE-NEXT: movaps %xmm1, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; SSE-NEXT: movaps %xmm1, %xmm4
+; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1,0]
+; SSE-NEXT: subss %xmm4, %xmm3
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE-NEXT: subss %xmm3, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: not_a_hsub_2:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
+; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX-NEXT: vsubss %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
+; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX-NEXT: vsubss %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 2
%vecext1 = extractelement <4 x float> %A, i32 3
%sub = fsub float %vecext, %vecext1
@@ -503,12 +960,28 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
%vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
ret <4 x float> %vecinit13
}
-; CHECK-LABEL: not_a_hsub_2
-; CHECK-NOT: hsubps
-; CHECK: ret
-
define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
+; SSE-LABEL: not_a_hsub_3:
+; SSE: # BB#0:
+; SSE-NEXT: movapd %xmm1, %xmm2
+; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
+; SSE-NEXT: subsd %xmm2, %xmm1
+; SSE-NEXT: movapd %xmm0, %xmm2
+; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
+; SSE-NEXT: subsd %xmm0, %xmm2
+; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: not_a_hsub_3:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX-NEXT: vsubsd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX-NEXT: vsubsd %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
%vecext = extractelement <2 x double> %B, i32 0
%vecext1 = extractelement <2 x double> %B, i32 1
%sub = fsub double %vecext, %vecext1
@@ -519,15 +992,21 @@ define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
%vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
ret <2 x double> %vecinit2
}
-; CHECK-LABEL: not_a_hsub_3
-; CHECK-NOT: hsubpd
-; CHECK: ret
-
; Test AVX horizontal add/sub of packed single/double precision
; floating point values from 256-bit vectors.
define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: avx_vhadd_ps:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm2, %xmm0
+; SSE-NEXT: haddps %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: avx_vhadd_ps:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <8 x float> %a, i32 0
%vecext1 = extractelement <8 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -562,17 +1041,18 @@ define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
%vecinit29 = insertelement <8 x float> %vecinit25, float %add28, i32 7
ret <8 x float> %vecinit29
}
-; CHECK-LABEL: avx_vhadd_ps
-; SSE3: haddps
-; SSE3-NEXT: haddps
-; SSSE3: haddps
-; SSSE3-NEXT: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK: ret
-
define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: avx_vhsub_ps:
+; SSE: # BB#0:
+; SSE-NEXT: hsubps %xmm2, %xmm0
+; SSE-NEXT: hsubps %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: avx_vhsub_ps:
+; AVX: # BB#0:
+; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <8 x float> %a, i32 0
%vecext1 = extractelement <8 x float> %a, i32 1
%sub = fsub float %vecext, %vecext1
@@ -607,17 +1087,18 @@ define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
%vecinit29 = insertelement <8 x float> %vecinit25, float %sub28, i32 7
ret <8 x float> %vecinit29
}
-; CHECK-LABEL: avx_vhsub_ps
-; SSE3: hsubps
-; SSE3-NEXT: hsubps
-; SSSE3: hsubps
-; SSSE3-NEXT: hsubps
-; AVX: vhsubps
-; AVX2: vhsubps
-; CHECK: ret
-
define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
+; SSE-LABEL: avx_hadd_pd:
+; SSE: # BB#0:
+; SSE-NEXT: haddpd %xmm2, %xmm0
+; SSE-NEXT: haddpd %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: avx_hadd_pd:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x double> %a, i32 0
%vecext1 = extractelement <4 x double> %a, i32 1
%add = fadd double %vecext, %vecext1
@@ -636,17 +1117,18 @@ define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
%vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
ret <4 x double> %vecinit13
}
-; CHECK-LABEL: avx_hadd_pd
-; SSE3: haddpd
-; SSE3-NEXT: haddpd
-; SSSE3: haddpd
-; SSSE3-NEXT: haddpd
-; AVX: vhaddpd
-; AVX2: vhaddpd
-; CHECK: ret
-
define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
+; SSE-LABEL: avx_hsub_pd:
+; SSE: # BB#0:
+; SSE-NEXT: hsubpd %xmm2, %xmm0
+; SSE-NEXT: hsubpd %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: avx_hsub_pd:
+; AVX: # BB#0:
+; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x double> %a, i32 0
%vecext1 = extractelement <4 x double> %a, i32 1
%sub = fsub double %vecext, %vecext1
@@ -665,19 +1147,83 @@ define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
%vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
ret <4 x double> %vecinit13
}
-; CHECK-LABEL: avx_hsub_pd
-; SSE3: hsubpd
-; SSE3-NEXT: hsubpd
-; SSSE3: hsubpd
-; SSSE3-NEXT: hsubpd
-; AVX: vhsubpd
-; AVX2: vhsubpd
-; CHECK: ret
-
; Test AVX2 horizontal add of packed integer values from 256-bit vectors.
define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
+; SSE3-LABEL: avx2_hadd_d:
+; SSE3: # BB#0:
+; SSE3-NEXT: movd %xmm0, %ecx
+; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
+; SSE3-NEXT: movd %xmm4, %r8d
+; SSE3-NEXT: addl %ecx, %r8d
+; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE3-NEXT: movd %xmm4, %edx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %r9d
+; SSE3-NEXT: addl %edx, %r9d
+; SSE3-NEXT: movd %xmm2, %esi
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %r10d
+; SSE3-NEXT: addl %esi, %r10d
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edi
+; SSE3-NEXT: addl %esi, %edi
+; SSE3-NEXT: movd %xmm1, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %r11d
+; SSE3-NEXT: addl %eax, %r11d
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: movd %xmm3, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE3-NEXT: movd %xmm0, %edx
+; SSE3-NEXT: addl %eax, %edx
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
+; SSE3-NEXT: movd %xmm0, %esi
+; SSE3-NEXT: addl %eax, %esi
+; SSE3-NEXT: movd %edi, %xmm0
+; SSE3-NEXT: movd %r9d, %xmm1
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE3-NEXT: movd %r10d, %xmm2
+; SSE3-NEXT: movd %r8d, %xmm0
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: movd %esi, %xmm1
+; SSE3-NEXT: movd %ecx, %xmm2
+; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE3-NEXT: movd %edx, %xmm3
+; SSE3-NEXT: movd %r11d, %xmm1
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: avx2_hadd_d:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm2, %xmm0
+; SSSE3-NEXT: phaddd %xmm3, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: avx2_hadd_d:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vphaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: avx2_hadd_d:
+; AVX2: # BB#0:
+; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
%vecext = extractelement <8 x i32> %a, i32 0
%vecext1 = extractelement <8 x i32> %a, i32 1
%add = add i32 %vecext, %vecext1
@@ -712,18 +1258,149 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
%vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
ret <8 x i32> %vecinit29
}
-; CHECK-LABEL: avx2_hadd_d
-; SSE3-NOT: phaddd
-; SSSE3: phaddd
-; SSSE3-NEXT: phaddd
-; AVX: vphaddd
-; AVX: vphaddd
-; AVX2: vphaddd
-; AVX2-NOT: vphaddd
-; CHECK: ret
-
define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
+; SSE3-LABEL: avx2_hadd_w:
+; SSE3: # BB#0:
+; SSE3-NEXT: pushq %rbp
+; SSE3-NEXT: .Ltmp12:
+; SSE3-NEXT: .cfi_def_cfa_offset 16
+; SSE3-NEXT: pushq %r15
+; SSE3-NEXT: .Ltmp13:
+; SSE3-NEXT: .cfi_def_cfa_offset 24
+; SSE3-NEXT: pushq %r14
+; SSE3-NEXT: .Ltmp14:
+; SSE3-NEXT: .cfi_def_cfa_offset 32
+; SSE3-NEXT: pushq %r13
+; SSE3-NEXT: .Ltmp15:
+; SSE3-NEXT: .cfi_def_cfa_offset 40
+; SSE3-NEXT: pushq %r12
+; SSE3-NEXT: .Ltmp16:
+; SSE3-NEXT: .cfi_def_cfa_offset 48
+; SSE3-NEXT: pushq %rbx
+; SSE3-NEXT: .Ltmp17:
+; SSE3-NEXT: .cfi_def_cfa_offset 56
+; SSE3-NEXT: .Ltmp18:
+; SSE3-NEXT: .cfi_offset %rbx, -56
+; SSE3-NEXT: .Ltmp19:
+; SSE3-NEXT: .cfi_offset %r12, -48
+; SSE3-NEXT: .Ltmp20:
+; SSE3-NEXT: .cfi_offset %r13, -40
+; SSE3-NEXT: .Ltmp21:
+; SSE3-NEXT: .cfi_offset %r14, -32
+; SSE3-NEXT: .Ltmp22:
+; SSE3-NEXT: .cfi_offset %r15, -24
+; SSE3-NEXT: .Ltmp23:
+; SSE3-NEXT: .cfi_offset %rbp, -16
+; SSE3-NEXT: movd %xmm0, %eax
+; SSE3-NEXT: pextrw $1, %xmm0, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE3-NEXT: pextrw $2, %xmm0, %eax
+; SSE3-NEXT: pextrw $3, %xmm0, %r15d
+; SSE3-NEXT: addl %eax, %r15d
+; SSE3-NEXT: pextrw $4, %xmm0, %eax
+; SSE3-NEXT: pextrw $5, %xmm0, %r14d
+; SSE3-NEXT: addl %eax, %r14d
+; SSE3-NEXT: pextrw $6, %xmm0, %eax
+; SSE3-NEXT: pextrw $7, %xmm0, %r13d
+; SSE3-NEXT: addl %eax, %r13d
+; SSE3-NEXT: movd %xmm1, %eax
+; SSE3-NEXT: pextrw $1, %xmm1, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; SSE3-NEXT: pextrw $2, %xmm1, %eax
+; SSE3-NEXT: pextrw $3, %xmm1, %r11d
+; SSE3-NEXT: addl %eax, %r11d
+; SSE3-NEXT: pextrw $4, %xmm1, %eax
+; SSE3-NEXT: pextrw $5, %xmm1, %r10d
+; SSE3-NEXT: addl %eax, %r10d
+; SSE3-NEXT: pextrw $6, %xmm1, %eax
+; SSE3-NEXT: pextrw $7, %xmm1, %r12d
+; SSE3-NEXT: addl %eax, %r12d
+; SSE3-NEXT: movd %xmm2, %eax
+; SSE3-NEXT: pextrw $1, %xmm2, %ebx
+; SSE3-NEXT: addl %eax, %ebx
+; SSE3-NEXT: pextrw $2, %xmm2, %eax
+; SSE3-NEXT: pextrw $3, %xmm2, %ecx
+; SSE3-NEXT: addl %eax, %ecx
+; SSE3-NEXT: pextrw $4, %xmm2, %esi
+; SSE3-NEXT: pextrw $5, %xmm2, %r8d
+; SSE3-NEXT: addl %esi, %r8d
+; SSE3-NEXT: pextrw $6, %xmm2, %esi
+; SSE3-NEXT: pextrw $7, %xmm2, %edx
+; SSE3-NEXT: addl %esi, %edx
+; SSE3-NEXT: movd %xmm3, %edi
+; SSE3-NEXT: pextrw $1, %xmm3, %r9d
+; SSE3-NEXT: addl %edi, %r9d
+; SSE3-NEXT: pextrw $2, %xmm3, %ebp
+; SSE3-NEXT: pextrw $3, %xmm3, %edi
+; SSE3-NEXT: addl %ebp, %edi
+; SSE3-NEXT: pextrw $4, %xmm3, %eax
+; SSE3-NEXT: pextrw $5, %xmm3, %ebp
+; SSE3-NEXT: addl %eax, %ebp
+; SSE3-NEXT: pextrw $6, %xmm3, %esi
+; SSE3-NEXT: pextrw $7, %xmm3, %eax
+; SSE3-NEXT: addl %esi, %eax
+; SSE3-NEXT: movd %edx, %xmm8
+; SSE3-NEXT: movd %r13d, %xmm3
+; SSE3-NEXT: movd %ecx, %xmm9
+; SSE3-NEXT: movd %r15d, %xmm4
+; SSE3-NEXT: movd %r8d, %xmm10
+; SSE3-NEXT: movd %r14d, %xmm7
+; SSE3-NEXT: movd %ebx, %xmm11
+; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
+; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE3-NEXT: movd %eax, %xmm12
+; SSE3-NEXT: movd %r12d, %xmm6
+; SSE3-NEXT: movd %edi, %xmm13
+; SSE3-NEXT: movd %r11d, %xmm5
+; SSE3-NEXT: movd %ebp, %xmm14
+; SSE3-NEXT: movd %r10d, %xmm2
+; SSE3-NEXT: movd %r9d, %xmm15
+; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload
+; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE3-NEXT: popq %rbx
+; SSE3-NEXT: popq %r12
+; SSE3-NEXT: popq %r13
+; SSE3-NEXT: popq %r14
+; SSE3-NEXT: popq %r15
+; SSE3-NEXT: popq %rbp
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: avx2_hadd_w:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddw %xmm2, %xmm0
+; SSSE3-NEXT: phaddw %xmm3, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: avx2_hadd_w:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vphaddw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vphaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: avx2_hadd_w:
+; AVX2: # BB#0:
+; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
%vecext = extractelement <16 x i16> %a, i32 0
%vecext1 = extractelement <16 x i16> %a, i32 1
%add = add i16 %vecext, %vecext1
@@ -790,13 +1467,3 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
%vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
ret <16 x i16> %vecinit108
}
-; CHECK-LABEL: avx2_hadd_w
-; SSE3-NOT: phaddw
-; SSSE3: phaddw
-; SSSE3-NEXT: phaddw
-; AVX: vphaddw
-; AVX: vphaddw
-; AVX2: vphaddw
-; AVX2-NOT: vphaddw
-; CHECK: ret
-
diff --git a/test/CodeGen/X86/haddsub-undef.ll b/test/CodeGen/X86/haddsub-undef.ll
index dfe5fff72d07..5e2e50893d03 100644
--- a/test/CodeGen/X86/haddsub-undef.ll
+++ b/test/CodeGen/X86/haddsub-undef.ll
@@ -1,10 +1,20 @@
-; RUN: llc < %s -march=x86-64 -mattr=ssse3 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
-; RUN: llc < %s -march=x86-64 -mattr=avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
-; RUN: llc < %s -march=x86-64 -mattr=avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; Verify that we correctly fold horizontal binop even in the presence of UNDEFs.
define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test1_undef:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test1_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
%vecext1 = extractelement <4 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -19,14 +29,17 @@ define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
%vecinit13 = insertelement <4 x float> %vecinit5, float %add12, i32 3
ret <4 x float> %vecinit13
}
-; CHECK-LABEL: test1_undef
-; SSE: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK-NEXT: ret
-
define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test2_undef:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test2_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
%vecext1 = extractelement <4 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -41,14 +54,17 @@ define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) {
%vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
ret <4 x float> %vecinit13
}
-; CHECK-LABEL: test2_undef
-; SSE: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK-NEXT: ret
-
define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test3_undef:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test3_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
%vecext1 = extractelement <4 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -63,38 +79,57 @@ define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) {
%vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
ret <4 x float> %vecinit9
}
-; CHECK-LABEL: test3_undef
-; SSE: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK-NEXT: ret
-
define <4 x float> @test4_undef(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test4_undef:
+; SSE: # BB#0:
+; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test4_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
%vecext1 = extractelement <4 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
%vecinit = insertelement <4 x float> undef, float %add, i32 0
ret <4 x float> %vecinit
}
-; CHECK-LABEL: test4_undef
-; CHECK-NOT: haddps
-; CHECK: ret
-
define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test5_undef:
+; SSE: # BB#0:
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; SSE-NEXT: addsd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test5_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <2 x double> %a, i32 0
%vecext1 = extractelement <2 x double> %a, i32 1
%add = fadd double %vecext, %vecext1
%vecinit = insertelement <2 x double> undef, double %add, i32 0
ret <2 x double> %vecinit
}
-; CHECK-LABEL: test5_undef
-; CHECK-NOT: haddpd
-; CHECK: ret
-
define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test6_undef:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test6_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
%vecext1 = extractelement <4 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -105,14 +140,17 @@ define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) {
%vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
ret <4 x float> %vecinit5
}
-; CHECK-LABEL: test6_undef
-; SSE: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK-NEXT: ret
-
define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test7_undef:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test7_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %b, i32 0
%vecext1 = extractelement <4 x float> %b, i32 1
%add = fadd float %vecext, %vecext1
@@ -123,14 +161,30 @@ define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) {
%vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
ret <4 x float> %vecinit5
}
-; CHECK-LABEL: test7_undef
-; SSE: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK-NEXT: ret
-
define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test8_undef:
+; SSE: # BB#0:
+; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE-NEXT: addss %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE-NEXT: addss %xmm2, %xmm0
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test8_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
%vecext1 = extractelement <4 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -141,12 +195,17 @@ define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
%vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 2
ret <4 x float> %vecinit5
}
-; CHECK-LABEL: test8_undef
-; CHECK-NOT: haddps
-; CHECK: ret
-
define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test9_undef:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test9_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
%vecext1 = extractelement <4 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -157,11 +216,17 @@ define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) {
%vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
ret <4 x float> %vecinit5
}
-; CHECK-LABEL: test9_undef
-; CHECK: haddps
-; CHECK-NEXT: ret
define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: test10_undef:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test10_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <8 x float> %a, i32 0
%vecext1 = extractelement <8 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -172,14 +237,21 @@ define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) {
%vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 3
ret <8 x float> %vecinit5
}
-; CHECK-LABEL: test10_undef
-; SSE: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK-NOT: haddps
-; CHECK: ret
define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: test11_undef:
+; SSE: # BB#0:
+; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE-NEXT: addss %xmm3, %xmm1
+; SSE-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test11_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <8 x float> %a, i32 0
%vecext1 = extractelement <8 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -190,13 +262,17 @@ define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
%vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 6
ret <8 x float> %vecinit5
}
-; CHECK-LABEL: test11_undef
-; SSE-NOT: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK: ret
define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: test12_undef:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test12_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: retq
%vecext = extractelement <8 x float> %a, i32 0
%vecext1 = extractelement <8 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
@@ -207,14 +283,18 @@ define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) {
%vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
ret <8 x float> %vecinit5
}
-; CHECK-LABEL: test12_undef
-; SSE: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK-NOT: haddps
-; CHECK: ret
define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: test13_undef:
+; SSE: # BB#0:
+; SSE-NEXT: haddps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test13_undef:
+; AVX: # BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%vecext = extractelement <8 x float> %a, i32 0
%vecext1 = extractelement <8 x float> %a, i32 1
%add1 = fadd float %vecext, %vecext1
@@ -233,15 +313,22 @@ define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
%vecinit4 = insertelement <8 x float> %vecinit3, float %add4, i32 3
ret <8 x float> %vecinit4
}
-; CHECK-LABEL: test13_undef
-; SSE: haddps
-; SSE-NOT: haddps
-; AVX: vhaddps
-; AVX2: vhaddps
-; CHECK-NOT: haddps
-; CHECK: ret
define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: test14_undef:
+; SSE: # BB#0:
+; SSE-NEXT: phaddd %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test14_undef:
+; AVX1: # BB#0:
+; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test14_undef:
+; AVX2: # BB#0:
+; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
%vecext = extractelement <8 x i32> %a, i32 0
%vecext1 = extractelement <8 x i32> %a, i32 1
%add = add i32 %vecext, %vecext1
@@ -252,17 +339,45 @@ define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
%vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 3
ret <8 x i32> %vecinit5
}
-; CHECK-LABEL: test14_undef
-; SSE: phaddd
-; AVX: vphaddd
-; AVX2: vphaddd
-; CHECK-NOT: phaddd
-; CHECK: ret
; On AVX2, the following sequence can be folded into a single horizontal add.
-; If the Subtarget doesn't support AVX2, then we avoid emitting two packed
+; If the Subtarget doesn't support AVX2, then we avoid emitting two packed
; integer horizontal adds instead of two scalar adds followed by vector inserts.
define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: test15_undef:
+; SSE: # BB#0:
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE-NEXT: movd %xmm0, %ecx
+; SSE-NEXT: addl %eax, %ecx
+; SSE-NEXT: movd %xmm3, %eax
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE-NEXT: movd %xmm0, %edx
+; SSE-NEXT: addl %eax, %edx
+; SSE-NEXT: movd %ecx, %xmm0
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test15_undef:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vpextrd $1, %xmm0, %ecx
+; AVX1-NEXT: addl %eax, %ecx
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; AVX1-NEXT: addl %eax, %edx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vmovd %edx, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test15_undef:
+; AVX2: # BB#0:
+; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
%vecext = extractelement <8 x i32> %a, i32 0
%vecext1 = extractelement <8 x i32> %a, i32 1
%add = add i32 %vecext, %vecext1
@@ -273,13 +388,22 @@ define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
%vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 6
ret <8 x i32> %vecinit5
}
-; CHECK-LABEL: test15_undef
-; SSE-NOT: phaddd
-; AVX-NOT: vphaddd
-; AVX2: vphaddd
-; CHECK: ret
define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: test16_undef:
+; SSE: # BB#0:
+; SSE-NEXT: phaddd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test16_undef:
+; AVX1: # BB#0:
+; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test16_undef:
+; AVX2: # BB#0:
+; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
%vecext = extractelement <8 x i32> %a, i32 0
%vecext1 = extractelement <8 x i32> %a, i32 1
%add = add i32 %vecext, %vecext1
@@ -290,14 +414,24 @@ define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
%vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
ret <8 x i32> %vecinit5
}
-; CHECK-LABEL: test16_undef
-; SSE: phaddd
-; AVX: vphaddd
-; AVX2: vphaddd
-; CHECK-NOT: haddps
-; CHECK: ret
define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: test17_undef:
+; SSE: # BB#0:
+; SSE-NEXT: phaddd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test17_undef:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test17_undef:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
%vecext = extractelement <8 x i32> %a, i32 0
%vecext1 = extractelement <8 x i32> %a, i32 1
%add1 = add i32 %vecext, %vecext1
@@ -316,10 +450,3 @@ define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
%vecinit4 = insertelement <8 x i32> %vecinit3, i32 %add4, i32 3
ret <8 x i32> %vecinit4
}
-; CHECK-LABEL: test17_undef
-; SSE: phaddd
-; AVX: vphaddd
-; AVX2: vphaddd
-; CHECK-NOT: haddps
-; CHECK: ret
-
diff --git a/test/CodeGen/X86/haddsub.ll b/test/CodeGen/X86/haddsub.ll
index 6e65c6c739ca..8e28433d2ac2 100644
--- a/test/CodeGen/X86/haddsub.ll
+++ b/test/CodeGen/X86/haddsub.ll
@@ -1,293 +1,392 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,-avx | FileCheck %s -check-prefix=SSE3
-; RUN: llc < %s -march=x86-64 -mattr=-sse3,+avx | FileCheck %s -check-prefix=AVX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefix=SSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
+define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd1:
-; SSE3-NOT: vhaddpd
-; SSE3: haddpd
+; SSE3: # BB#0:
+; SSE3-NEXT: haddpd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddpd1:
-; AVX: vhaddpd
-define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
%b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
%r = fadd <2 x double> %a, %b
ret <2 x double> %r
}
+define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd2:
-; SSE3-NOT: vhaddpd
-; SSE3: haddpd
+; SSE3: # BB#0:
+; SSE3-NEXT: haddpd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddpd2:
-; AVX: vhaddpd
-define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 2>
%b = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 2, i32 1>
%r = fadd <2 x double> %a, %b
ret <2 x double> %r
}
+define <2 x double> @haddpd3(<2 x double> %x) {
; SSE3-LABEL: haddpd3:
-; SSE3-NOT: vhaddpd
-; SSE3: haddpd
+; SSE3: # BB#0:
+; SSE3-NEXT: haddpd %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddpd3:
-; AVX: vhaddpd
-define <2 x double> @haddpd3(<2 x double> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
%b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
%r = fadd <2 x double> %a, %b
ret <2 x double> %r
}
+define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps1:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddps1:
-; AVX: vhaddps
-define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
+define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps2:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddps2:
-; AVX: vhaddps
-define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
%b = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
+define <4 x float> @haddps3(<4 x float> %x) {
; SSE3-LABEL: haddps3:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddps3:
-; AVX: vhaddps
-define <4 x float> @haddps3(<4 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
+define <4 x float> @haddps4(<4 x float> %x) {
; SSE3-LABEL: haddps4:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddps4:
-; AVX: vhaddps
-define <4 x float> @haddps4(<4 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
+define <4 x float> @haddps5(<4 x float> %x) {
; SSE3-LABEL: haddps5:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddps5:
-; AVX: vhaddps
-define <4 x float> @haddps5(<4 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
+define <4 x float> @haddps6(<4 x float> %x) {
; SSE3-LABEL: haddps6:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddps6:
-; AVX: vhaddps
-define <4 x float> @haddps6(<4 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
+define <4 x float> @haddps7(<4 x float> %x) {
; SSE3-LABEL: haddps7:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: haddps7:
-; AVX: vhaddps
-define <4 x float> @haddps7(<4 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
%r = fadd <4 x float> %a, %b
ret <4 x float> %r
}
+define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: hsubpd1:
-; SSE3-NOT: vhsubpd
-; SSE3: hsubpd
+; SSE3: # BB#0:
+; SSE3-NEXT: hsubpd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: hsubpd1:
-; AVX: vhsubpd
-define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
%b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
%r = fsub <2 x double> %a, %b
ret <2 x double> %r
}
+define <2 x double> @hsubpd2(<2 x double> %x) {
; SSE3-LABEL: hsubpd2:
-; SSE3-NOT: vhsubpd
-; SSE3: hsubpd
+; SSE3: # BB#0:
+; SSE3-NEXT: hsubpd %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: hsubpd2:
-; AVX: vhsubpd
-define <2 x double> @hsubpd2(<2 x double> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
%b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
%r = fsub <2 x double> %a, %b
ret <2 x double> %r
}
+define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: hsubps1:
-; SSE3-NOT: vhsubps
-; SSE3: hsubps
+; SSE3: # BB#0:
+; SSE3-NEXT: hsubps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: hsubps1:
-; AVX: vhsubps
-define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%r = fsub <4 x float> %a, %b
ret <4 x float> %r
}
+define <4 x float> @hsubps2(<4 x float> %x) {
; SSE3-LABEL: hsubps2:
-; SSE3-NOT: vhsubps
-; SSE3: hsubps
+; SSE3: # BB#0:
+; SSE3-NEXT: hsubps %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: hsubps2:
-; AVX: vhsubps
-define <4 x float> @hsubps2(<4 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
%r = fsub <4 x float> %a, %b
ret <4 x float> %r
}
+define <4 x float> @hsubps3(<4 x float> %x) {
; SSE3-LABEL: hsubps3:
-; SSE3-NOT: vhsubps
-; SSE3: hsubps
+; SSE3: # BB#0:
+; SSE3-NEXT: hsubps %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: hsubps3:
-; AVX: vhsubps
-define <4 x float> @hsubps3(<4 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%r = fsub <4 x float> %a, %b
ret <4 x float> %r
}
+define <4 x float> @hsubps4(<4 x float> %x) {
; SSE3-LABEL: hsubps4:
-; SSE3-NOT: vhsubps
-; SSE3: hsubps
+; SSE3: # BB#0:
+; SSE3-NEXT: hsubps %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
; AVX-LABEL: hsubps4:
-; AVX: vhsubps
-define <4 x float> @hsubps4(<4 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%r = fsub <4 x float> %a, %b
ret <4 x float> %r
}
+define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps1:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm2, %xmm0
+; SSE3-NEXT: haddps %xmm3, %xmm1
+; SSE3-NEXT: retq
+;
; AVX-LABEL: vhaddps1:
-; AVX: vhaddps
-define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
%b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
%r = fadd <8 x float> %a, %b
ret <8 x float> %r
}
+define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps2:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm2, %xmm0
+; SSE3-NEXT: haddps %xmm3, %xmm1
+; SSE3-NEXT: retq
+;
; AVX-LABEL: vhaddps2:
-; AVX: vhaddps
-define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
%b = shufflevector <8 x float> %y, <8 x float> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
%r = fadd <8 x float> %a, %b
ret <8 x float> %r
}
+define <8 x float> @vhaddps3(<8 x float> %x) {
; SSE3-LABEL: vhaddps3:
-; SSE3-NOT: vhaddps
-; SSE3: haddps
-; SSE3: haddps
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm0, %xmm0
+; SSE3-NEXT: haddps %xmm1, %xmm1
+; SSE3-NEXT: retq
+;
; AVX-LABEL: vhaddps3:
-; AVX: vhaddps
-define <8 x float> @vhaddps3(<8 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: retq
%a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
%b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
%r = fadd <8 x float> %a, %b
ret <8 x float> %r
}
+define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhsubps1:
-; SSE3-NOT: vhsubps
-; SSE3: hsubps
-; SSE3: hsubps
+; SSE3: # BB#0:
+; SSE3-NEXT: hsubps %xmm2, %xmm0
+; SSE3-NEXT: hsubps %xmm3, %xmm1
+; SSE3-NEXT: retq
+;
; AVX-LABEL: vhsubps1:
-; AVX: vhsubps
-define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
%b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
%r = fsub <8 x float> %a, %b
ret <8 x float> %r
}
+define <8 x float> @vhsubps3(<8 x float> %x) {
; SSE3-LABEL: vhsubps3:
-; SSE3-NOT: vhsubps
-; SSE3: hsubps
-; SSE3: hsubps
+; SSE3: # BB#0:
+; SSE3-NEXT: hsubps %xmm0, %xmm0
+; SSE3-NEXT: hsubps %xmm1, %xmm1
+; SSE3-NEXT: retq
+;
; AVX-LABEL: vhsubps3:
-; AVX: vhsubps
-define <8 x float> @vhsubps3(<8 x float> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: retq
%a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
%b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
%r = fsub <8 x float> %a, %b
ret <8 x float> %r
}
+define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhaddpd1:
-; SSE3-NOT: vhaddpd
-; SSE3: haddpd
-; SSE3: haddpd
+; SSE3: # BB#0:
+; SSE3-NEXT: haddpd %xmm2, %xmm0
+; SSE3-NEXT: haddpd %xmm3, %xmm1
+; SSE3-NEXT: retq
+;
; AVX-LABEL: vhaddpd1:
-; AVX: vhaddpd
-define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
%b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
%r = fadd <4 x double> %a, %b
ret <4 x double> %r
}
+define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhsubpd1:
-; SSE3-NOT: vhsubpd
-; SSE3: hsubpd
-; SSE3: hsubpd
+; SSE3: # BB#0:
+; SSE3-NEXT: hsubpd %xmm2, %xmm0
+; SSE3-NEXT: hsubpd %xmm3, %xmm1
+; SSE3-NEXT: retq
+;
; AVX-LABEL: vhsubpd1:
-; AVX: vhsubpd
-define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
%a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
%b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
%r = fsub <4 x double> %a, %b
ret <4 x double> %r
}
-; CHECK-LABEL: haddps_v2f32
-; CHECK: haddps %xmm{{[0-9]+}}, %xmm0
-; CHECK-NEXT: retq
define <2 x float> @haddps_v2f32(<4 x float> %v0) {
+; SSE3-LABEL: haddps_v2f32:
+; SSE3: # BB#0:
+; SSE3-NEXT: haddps %xmm0, %xmm0
+; SSE3-NEXT: retq
+;
+; AVX-LABEL: haddps_v2f32:
+; AVX: # BB#0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%v0.0 = extractelement <4 x float> %v0, i32 0
%v0.1 = extractelement <4 x float> %v0, i32 1
%v0.2 = extractelement <4 x float> %v0, i32 2
diff --git a/test/CodeGen/X86/half.ll b/test/CodeGen/X86/half.ll
index 3b2518e28f58..717ddbfa6fdc 100644
--- a/test/CodeGen/X86/half.ll
+++ b/test/CodeGen/X86/half.ll
@@ -1,12 +1,17 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -asm-verbose=false \
-; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c -asm-verbose=false \
-; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-F16C
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -asm-verbose=false -fixup-byte-word-insts=1 \
+; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL -check-prefix=BWON
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c -asm-verbose=false -fixup-byte-word-insts=0 \
+; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL -check-prefix=BWOFF
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c -asm-verbose=false -fixup-byte-word-insts=1 \
+; RUN: | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-F16C -check-prefix=BWON
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr +sse2 -asm-verbose=false -fixup-byte-word-insts=0 \
+; RUN: | FileCheck %s -check-prefix=CHECK-I686
define void @test_load_store(half* %in, half* %out) {
; CHECK-LABEL: test_load_store:
-; CHECK: movw (%rdi), [[TMP:%[a-z0-9]+]]
-; CHECK: movw [[TMP]], (%rsi)
+; BWON: movzwl (%rdi), %eax
+; BWOFF: movw (%rdi), %ax
+; CHECK: movw %ax, (%rsi)
%val = load half, half* %in
store half %val, half* %out
ret void
@@ -14,7 +19,8 @@ define void @test_load_store(half* %in, half* %out) {
define i16 @test_bitcast_from_half(half* %addr) {
; CHECK-LABEL: test_bitcast_from_half:
-; CHECK: movzwl (%rdi), %eax
+; BWON: movzwl (%rdi), %eax
+; BWOFF: movw (%rdi), %ax
%val = load half, half* %addr
%val_int = bitcast half %val to i16
ret i16 %val_int
@@ -102,7 +108,7 @@ define void @test_sitofp_i64(i64 %a, half* %p) #0 {
; CHECK-LIBCALL-NEXT: retq
; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, [[REG0:%[a-z0-9]+]], [[REG0]]
-; CHECK-F16C-NEXT: vcvtps2ph $0, [[REG0]], [[REG0]]
+; CHECK-F16C-NEXT: vcvtps2ph $4, [[REG0]], [[REG0]]
; CHECK-F16C-NEXT: vmovd [[REG0]], %eax
; CHECK-F16C-NEXT: movw %ax, (%rsi)
; CHECK-F16C-NEXT: retq
@@ -175,7 +181,7 @@ define void @test_uitofp_i64(i64 %a, half* %p) #0 {
; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
; CHECK-LIBCALL-NEXT: movw %ax, ([[ADDR]])
; CHECK-LIBCALL-NEXT: popq [[ADDR]]
-; CHECK-F16C-NEXT: vcvtps2ph $0, [[REG1]], [[REG4:%[a-z0-9]+]]
+; CHECK-F16C-NEXT: vcvtps2ph $4, [[REG1]], [[REG4:%[a-z0-9]+]]
; CHECK-F16C-NEXT: vmovd [[REG4]], %eax
; CHECK-F16C-NEXT: movw %ax, (%rsi)
; CHECK-NEXT: retq
@@ -260,4 +266,51 @@ define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) {
ret void
}
+declare float @test_floatret();
+
+; On i686, if SSE2 is available, the return value from test_floatret is loaded
+; to f80 and then rounded to f32. The DAG combiner should not combine this
+; fp_round and the subsequent fptrunc from float to half.
+define half @test_f80trunc_nodagcombine() #0 {
+; CHECK-LABEL: test_f80trunc_nodagcombine:
+; CHECK-I686-NOT: calll __truncxfhf2
+ %1 = call float @test_floatret()
+ %2 = fptrunc float %1 to half
+ ret half %2
+}
+
+; CHECK-LABEL: test_sitofp_fadd_i32:
+
+; CHECK-LIBCALL-NEXT: pushq %rbx
+; CHECK-LIBCALL-NEXT: subq $16, %rsp
+; CHECK-LIBCALL-NEXT: movl %edi, %ebx
+; CHECK-LIBCALL-NEXT: movzwl (%rsi), %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: movss %xmm0, 12(%rsp)
+; CHECK-LIBCALL-NEXT: cvtsi2ssl %ebx, %xmm0
+; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
+; CHECK-LIBCALL-NEXT: movzwl %ax, %edi
+; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
+; CHECK-LIBCALL-NEXT: addss 12(%rsp), %xmm0
+; CHECK-LIBCALL-NEXT: addq $16, %rsp
+; CHECK-LIBCALL-NEXT: popq %rbx
+; CHECK-LIBCALL-NEXT: retq
+
+; CHECK-F16C-NEXT: movswl (%rsi), %eax
+; CHECK-F16C-NEXT: vmovd %eax, %xmm0
+; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; CHECK-F16C-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm1
+; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; CHECK-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
+; CHECK-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-F16C-NEXT: retq
+
+define float @test_sitofp_fadd_i32(i32 %a, half* %b) #0 {
+ %tmp0 = load half, half* %b
+ %tmp1 = sitofp i32 %a to half
+ %tmp2 = fadd half %tmp0, %tmp1
+ %tmp3 = fpext half %tmp2 to float
+ ret float %tmp3
+}
+
attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/hipe-cc.ll b/test/CodeGen/X86/hipe-cc.ll
index e3808e754228..fbc4cd9d4f9c 100644
--- a/test/CodeGen/X86/hipe-cc.ll
+++ b/test/CodeGen/X86/hipe-cc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -tailcallopt -code-model=medium -stack-alignment=4 -mtriple=i686-linux-gnu -mcpu=pentium | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -tailcallopt -code-model=medium -stack-alignment=4 -mtriple=i686-linux-gnu -mcpu=pentium | FileCheck %s
; Check the HiPE calling convention works (x86-32)
@@ -73,5 +73,23 @@ define cc 11 void @baz() nounwind {
ret void
}
+; Sanity-check the tail call sequence. The number of arguments was chosen so
+; as to expose a bug where the tail call sequence clobbered the stack.
+define cc 11 { i32, i32, i32 } @tailcaller(i32 %hp, i32 %p) nounwind {
+ ; CHECK: movl $15, %eax
+ ; CHECK-NEXT: movl $31, %edx
+ ; CHECK-NEXT: movl $47, %ecx
+ ; CHECK-NEXT: popl %edi
+ ; CHECK-NEXT: jmp tailcallee
+ %ret = tail call cc11 { i32, i32, i32 } @tailcallee(i32 %hp, i32 %p, i32 15,
+ i32 31, i32 47, i32 63) nounwind
+ ret { i32, i32, i32 } %ret
+}
+
+!hipe.literals = !{ !0, !1, !2 }
+!0 = !{ !"P_NSP_LIMIT", i32 84 }
+!1 = !{ !"X86_LEAF_WORDS", i32 24 }
+!2 = !{ !"AMD64_LEAF_WORDS", i32 24 }
@clos = external constant i32
declare cc 11 void @bar(i32, i32, i32, i32, i32)
+declare cc 11 { i32, i32, i32 } @tailcallee(i32, i32, i32, i32, i32, i32)
diff --git a/test/CodeGen/X86/hipe-cc64.ll b/test/CodeGen/X86/hipe-cc64.ll
index 28d90399d857..43e2e1409fde 100644
--- a/test/CodeGen/X86/hipe-cc64.ll
+++ b/test/CodeGen/X86/hipe-cc64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -tailcallopt -code-model=medium -stack-alignment=8 -mtriple=x86_64-linux-gnu -mcpu=opteron | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -tailcallopt -code-model=medium -stack-alignment=8 -mtriple=x86_64-linux-gnu -mcpu=opteron | FileCheck %s
; Check the HiPE calling convention works (x86-64)
@@ -83,5 +83,24 @@ define cc 11 void @baz() nounwind {
ret void
}
+; Sanity-check the tail call sequence. The number of arguments was chosen so
+; as to expose a bug where the tail call sequence clobbered the stack.
+define cc 11 { i64, i64, i64 } @tailcaller(i64 %hp, i64 %p) #0 {
+ ; CHECK: movl $15, %esi
+ ; CHECK-NEXT: movl $31, %edx
+ ; CHECK-NEXT: movl $47, %ecx
+ ; CHECK-NEXT: movl $63, %r8d
+ ; CHECK-NEXT: popq %rax
+ ; CHECK-NEXT: jmp tailcallee
+ %ret = tail call cc11 { i64, i64, i64 } @tailcallee(i64 %hp, i64 %p, i64 15,
+ i64 31, i64 47, i64 63, i64 79) #1
+ ret { i64, i64, i64 } %ret
+}
+
+!hipe.literals = !{ !0, !1, !2 }
+!0 = !{ !"P_NSP_LIMIT", i32 160 }
+!1 = !{ !"X86_LEAF_WORDS", i32 24 }
+!2 = !{ !"AMD64_LEAF_WORDS", i32 24 }
@clos = external constant i64
declare cc 11 void @bar(i64, i64, i64, i64, i64, i64)
+declare cc 11 { i64, i64, i64 } @tailcallee(i64, i64, i64, i64, i64, i64, i64)
diff --git a/test/CodeGen/X86/hipe-prologue.ll b/test/CodeGen/X86/hipe-prologue.ll
index 2f16423600c9..8588dff9bc63 100644
--- a/test/CodeGen/X86/hipe-prologue.ll
+++ b/test/CodeGen/X86/hipe-prologue.ll
@@ -24,8 +24,8 @@ define {i32, i32} @test_basic(i32 %hp, i32 %p) {
define cc 11 {i32, i32} @test_basic_hipecc(i32 %hp, i32 %p) {
; X32-Linux-LABEL: test_basic_hipecc:
- ; X32-Linux: leal -156(%esp), %ebx
- ; X32-Linux-NEXT: cmpl 76(%ebp), %ebx
+ ; X32-Linux: leal -140(%esp), %ebx
+ ; X32-Linux-NEXT: cmpl 120(%ebp), %ebx
; X32-Linux-NEXT: jb .LBB1_1
; X32-Linux: ret
@@ -34,8 +34,8 @@ define cc 11 {i32, i32} @test_basic_hipecc(i32 %hp, i32 %p) {
; X32-Linux-NEXT: calll inc_stack_0
; X64-Linux-LABEL: test_basic_hipecc:
- ; X64-Linux: leaq -232(%rsp), %r14
- ; X64-Linux-NEXT: cmpq 144(%rbp), %r14
+ ; X64-Linux: leaq -184(%rsp), %r14
+ ; X64-Linux-NEXT: cmpq 120(%rbp), %r14
; X64-Linux-NEXT: jb .LBB1_1
; X64-Linux: ret
@@ -65,3 +65,8 @@ define cc 11 {i32,i32,i32} @test_nocall_hipecc(i32 %hp,i32 %p,i32 %x,i32 %y) {
%6 = insertvalue {i32, i32, i32} %5, i32 %p, 2
ret {i32, i32, i32} %6
}
+
+!hipe.literals = !{ !0, !1, !2 }
+!0 = !{ !"P_NSP_LIMIT", i32 120 }
+!1 = !{ !"X86_LEAF_WORDS", i32 24 }
+!2 = !{ !"AMD64_LEAF_WORDS", i32 18 }
diff --git a/test/CodeGen/X86/hoist-invariant-load.ll b/test/CodeGen/X86/hoist-invariant-load.ll
index 6798c2b30c3b..5ade5b470b54 100644
--- a/test/CodeGen/X86/hoist-invariant-load.ll
+++ b/test/CodeGen/X86/hoist-invariant-load.ll
@@ -1,5 +1,10 @@
; REQUIRES: asserts
-; RUN: llc < %s -stats -O2 2>&1 | grep "1 machine-licm"
+; RUN: llc -mcpu=haswell < %s -stats -O2 2>&1 | grep "4 machine-licm.*hoisted"
+; For test:
+; 2 invariant loads, 1 for OBJC_SELECTOR_REFERENCES_
+; and 1 for objc_msgSend from the GOT
+; For test_multi_def:
+; 2 invariant loads (full multiply, both loads should be hoisted).
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7.2"
@@ -27,4 +32,32 @@ for.end: ; preds = %for.body
declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind
+define void @test_multi_def(i64* dereferenceable(8) %x1,
+ i64* dereferenceable(8) %x2,
+ i128* %y, i64 %count) nounwind {
+entry:
+ br label %for.body
+
+for.check:
+ %inc = add nsw i64 %i, 1
+ %done = icmp sge i64 %inc, %count
+ br i1 %done, label %exit, label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %inc, %for.check ]
+ %x1_load = load i64, i64* %x1, align 8, !invariant.load !0
+ %x1_zext = zext i64 %x1_load to i128
+ %x2_load = load i64, i64* %x2, align 8, !invariant.load !0
+ %x2_zext = zext i64 %x2_load to i128
+ %x_prod = mul i128 %x1_zext, %x2_zext
+ %y_elem = getelementptr inbounds i128, i128* %y, i64 %i
+ %y_load = load i128, i128* %y_elem, align 8
+ %y_plus = add i128 %x_prod, %y_load
+ store i128 %y_plus, i128* %y_elem, align 8
+ br label %for.check
+
+exit:
+ ret void
+}
+
!0 = !{}
diff --git a/test/CodeGen/X86/hoist-spill-lpad.ll b/test/CodeGen/X86/hoist-spill-lpad.ll
new file mode 100644
index 000000000000..3171f6f9f6fd
--- /dev/null
+++ b/test/CodeGen/X86/hoist-spill-lpad.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s | FileCheck %s
+;
+; PR27612. The following spill is hoisted from two locations: the
+; fall-through successor block and the landingpad block of a call that may
+; throw an exception. If the spill is not hoisted before the call, it will be
+; missing on the landingpad path.
+;
+; CHECK-LABEL: _Z3foov:
+; CHECK: movq %rbx, (%rsp) # 8-byte Spill
+; CHECK-NEXT: callq _Z3goov
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = global [20 x i64] zeroinitializer, align 16
+@_ZTIi = external constant i8*
+
+; Function Attrs: uwtable
+define void @_Z3foov() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+ %tmp = load i64, i64* getelementptr inbounds ([20 x i64], [20 x i64]* @a, i64 0, i64 1), align 8
+ invoke void @_Z3goov()
+ to label %try.cont unwind label %lpad
+
+lpad: ; preds = %entry
+ %tmp1 = landingpad { i8*, i32 }
+ cleanup
+ catch i8* bitcast (i8** @_ZTIi to i8*)
+ %tmp2 = extractvalue { i8*, i32 } %tmp1, 1
+ %tmp3 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+ %matches = icmp eq i32 %tmp2, %tmp3
+ br i1 %matches, label %catch, label %ehcleanup
+
+catch: ; preds = %lpad
+ %tmp4 = extractvalue { i8*, i32 } %tmp1, 0
+ %tmp5 = tail call i8* @__cxa_begin_catch(i8* %tmp4)
+ store i64 %tmp, i64* getelementptr inbounds ([20 x i64], [20 x i64]* @a, i64 0, i64 2), align 16
+ tail call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{memory},~{dirflag},~{fpsr},~{flags}"()
+ store i64 %tmp, i64* getelementptr inbounds ([20 x i64], [20 x i64]* @a, i64 0, i64 3), align 8
+ tail call void @__cxa_end_catch()
+ br label %try.cont
+
+try.cont: ; preds = %catch, %entry
+ store i64 %tmp, i64* getelementptr inbounds ([20 x i64], [20 x i64]* @a, i64 0, i64 4), align 16
+ tail call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{memory},~{dirflag},~{fpsr},~{flags}"()
+ store i64 %tmp, i64* getelementptr inbounds ([20 x i64], [20 x i64]* @a, i64 0, i64 5), align 8
+ ret void
+
+ehcleanup: ; preds = %lpad
+ resume { i8*, i32 } %tmp1
+}
+
+declare void @_Z3goov()
+
+declare i32 @__gxx_personality_v0(...)
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.eh.typeid.for(i8*)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
diff --git a/test/CodeGen/X86/hoist-spill.ll b/test/CodeGen/X86/hoist-spill.ll
new file mode 100644
index 000000000000..db9c4105a020
--- /dev/null
+++ b/test/CodeGen/X86/hoist-spill.ll
@@ -0,0 +1,121 @@
+; RUN: llc < %s | FileCheck %s
+
+; grep 'Spill' |sed 's%.*\(-[0-9]\+(\%rsp)\).*%\1%g' |sort |uniq -d |awk '{if (/rsp/); exit -1}'
+; Check no spills to the same stack slot after hoisting.
+; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET1:-?[0-9]*]](%rsp)
+; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET2:-?[0-9]*]](%rsp)
+; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET3:-?[0-9]*]](%rsp)
+; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET1]](%rsp)
+; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET2]](%rsp)
+; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET3]](%rsp)
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = external global i32*, align 8
+@b = external global i32, align 4
+@d = external global i32*, align 8
+
+; Function Attrs: norecurse noreturn nounwind uwtable
+define void @fn1(i32 %p1) {
+entry:
+ %tmp = load i32*, i32** @d, align 8
+ %tmp1 = load i32*, i32** @a, align 8
+ %tmp2 = sext i32 %p1 to i64
+ br label %for.cond
+
+for.cond: ; preds = %for.inc14, %entry
+ %indvar = phi i32 [ %indvar.next, %for.inc14 ], [ 0, %entry ]
+ %indvars.iv30.in = phi i32 [ %indvars.iv30, %for.inc14 ], [ %p1, %entry ]
+ %c.0 = phi i32 [ %inc15, %for.inc14 ], [ 1, %entry ]
+ %k.0 = phi i32 [ %k.1.lcssa, %for.inc14 ], [ undef, %entry ]
+ %tmp3 = icmp sgt i32 undef, 0
+ %smax52 = select i1 %tmp3, i32 undef, i32 0
+ %tmp4 = zext i32 %smax52 to i64
+ %tmp5 = icmp sgt i64 undef, %tmp4
+ %smax53 = select i1 %tmp5, i64 undef, i64 %tmp4
+ %tmp6 = add nsw i64 %smax53, 1
+ %tmp7 = sub nsw i64 %tmp6, %tmp4
+ %tmp8 = add nsw i64 %tmp7, -8
+ %tmp9 = sub i32 undef, %indvar
+ %tmp10 = icmp sgt i64 %tmp2, 0
+ %smax40 = select i1 %tmp10, i64 %tmp2, i64 0
+ %scevgep41 = getelementptr i32, i32* %tmp1, i64 %smax40
+ %indvars.iv30 = add i32 %indvars.iv30.in, -1
+ %tmp11 = icmp sgt i32 %indvars.iv30, 0
+ %smax = select i1 %tmp11, i32 %indvars.iv30, i32 0
+ %tmp12 = zext i32 %smax to i64
+ %sub = sub nsw i32 %p1, %c.0
+ %cmp = icmp sgt i32 %sub, 0
+ %sub. = select i1 %cmp, i32 %sub, i32 0
+ %cmp326 = icmp sgt i32 %k.0, %p1
+ br i1 %cmp326, label %for.cond4.preheader, label %for.body.preheader
+
+for.body.preheader: ; preds = %for.cond
+ br label %for.body
+
+for.cond4.preheader: ; preds = %for.body, %for.cond
+ %k.1.lcssa = phi i32 [ %k.0, %for.cond ], [ %add, %for.body ]
+ %cmp528 = icmp sgt i32 %sub., %p1
+ br i1 %cmp528, label %for.inc14, label %for.body6.preheader
+
+for.body6.preheader: ; preds = %for.cond4.preheader
+ br i1 undef, label %for.body6, label %min.iters.checked
+
+min.iters.checked: ; preds = %for.body6.preheader
+ br i1 undef, label %for.body6, label %vector.memcheck
+
+vector.memcheck: ; preds = %min.iters.checked
+ %bound1 = icmp ule i32* undef, %scevgep41
+ %memcheck.conflict = and i1 undef, %bound1
+ br i1 %memcheck.conflict, label %for.body6, label %vector.body.preheader
+
+vector.body.preheader: ; preds = %vector.memcheck
+ %lcmp.mod = icmp eq i64 undef, 0
+ br i1 %lcmp.mod, label %vector.body.preheader.split, label %vector.body.prol
+
+vector.body.prol: ; preds = %vector.body.prol, %vector.body.preheader
+ %prol.iter.cmp = icmp eq i64 undef, 0
+ br i1 %prol.iter.cmp, label %vector.body.preheader.split, label %vector.body.prol
+
+vector.body.preheader.split: ; preds = %vector.body.prol, %vector.body.preheader
+ %tmp13 = icmp ult i64 %tmp8, 24
+ br i1 %tmp13, label %middle.block, label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.body.preheader.split
+ %index = phi i64 [ %index.next.3, %vector.body ], [ 0, %vector.body.preheader.split ]
+ %index.next = add i64 %index, 8
+ %offset.idx.1 = add i64 %tmp12, %index.next
+ %tmp14 = getelementptr inbounds i32, i32* %tmp, i64 %offset.idx.1
+ %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
+ %wide.load.1 = load <4 x i32>, <4 x i32>* %tmp15, align 4
+ %tmp16 = getelementptr inbounds i32, i32* %tmp1, i64 %offset.idx.1
+ %tmp17 = bitcast i32* %tmp16 to <4 x i32>*
+ store <4 x i32> %wide.load.1, <4 x i32>* %tmp17, align 4
+ %index.next.3 = add i64 %index, 32
+ br i1 undef, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body, %vector.body.preheader.split
+ br i1 undef, label %for.inc14, label %for.body6
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %k.127 = phi i32 [ %k.0, %for.body.preheader ], [ %add, %for.body ]
+ %add = add nsw i32 %k.127, 1
+ %tmp18 = load i32, i32* undef, align 4
+ store i32 %tmp18, i32* @b, align 4
+ br i1 undef, label %for.body, label %for.cond4.preheader
+
+for.body6: ; preds = %for.body6, %middle.block, %vector.memcheck, %min.iters.checked, %for.body6.preheader
+ %indvars.iv32 = phi i64 [ undef, %for.body6 ], [ %tmp12, %vector.memcheck ], [ %tmp12, %min.iters.checked ], [ %tmp12, %for.body6.preheader ], [ undef, %middle.block ]
+ %arrayidx8 = getelementptr inbounds i32, i32* %tmp, i64 %indvars.iv32
+ %tmp19 = load i32, i32* %arrayidx8, align 4
+ %arrayidx10 = getelementptr inbounds i32, i32* %tmp1, i64 %indvars.iv32
+ store i32 %tmp19, i32* %arrayidx10, align 4
+ %cmp5 = icmp slt i64 %indvars.iv32, undef
+ br i1 %cmp5, label %for.body6, label %for.inc14
+
+for.inc14: ; preds = %for.body6, %middle.block, %for.cond4.preheader
+ %inc15 = add nuw nsw i32 %c.0, 1
+ %indvar.next = add i32 %indvar, 1
+ br label %for.cond
+}
diff --git a/test/CodeGen/X86/i16lshr8pat.ll b/test/CodeGen/X86/i16lshr8pat.ll
new file mode 100644
index 000000000000..7f2da8e29538
--- /dev/null
+++ b/test/CodeGen/X86/i16lshr8pat.ll
@@ -0,0 +1,32 @@
+; RUN: llc -march=x86 -stop-after expand-isel-pseudos <%s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+; This test checks that the lshr in the %then1 block gets expanded using the
+; GR16_ABCD pattern rather than the GR32_ABCD pattern. Using the 16-bit
+; pattern avoids making the register liveness information look as if the
+; whole 32-bit register were a live value, and allows generally better
+; live-register analysis.
+; CHECK-LABEL: bb.1.then1:
+; CHECK-NOT: IMPLICIT_DEF
+; CHECK-NOT: INSERT_SUBREG
+; CHECK: sub_8bit_hi
+; CHECK-LABEL: bb.2.endif1:
+
+define i16 @foo4(i32 %prec, i8 *%dst, i16 *%src) {
+entry:
+ %cnd = icmp ne i32 %prec, 0
+ %t0 = load i16, i16 *%src, align 2
+ br i1 %cnd, label %then1, label %endif1
+
+then1:
+ %shr = lshr i16 %t0, 8
+ %conv = trunc i16 %shr to i8
+ store i8 %conv, i8 *%dst, align 1
+ br label %endif1
+
+endif1:
+ %t2 = phi i16 [0, %then1], [%t0, %entry]
+ ret i16 %t2
+}
diff --git a/test/CodeGen/X86/i386-setjmp-pic.ll b/test/CodeGen/X86/i386-setjmp-pic.ll
new file mode 100644
index 000000000000..43a8a0ec76cb
--- /dev/null
+++ b/test/CodeGen/X86/i386-setjmp-pic.ll
@@ -0,0 +1,23 @@
+; RUN: llc -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx"
+
+; Check that the register used as base pointer for setjmp
+; is properly initialized.
+; The test used to fail with the machine verifier complaining
+; that the global base pointer is not initialized.
+; PR26742.
+;
+; CHECK: test:
+; CHECK: calll [[BP_SETUP_LABEL:L[$0-9a-zA-Z_-]+]]
+; CHECK: [[BP_SETUP_LABEL]]:
+; CHECK-NEXT: popl [[BP:%[a-z]+]]
+;
+; CHECK: leal [[BLOCK_ADDR:LBB[$0-9a-zA-Z_-]+]]-[[BP_SETUP_LABEL]]([[BP]]),
+define i32 @test(i8* %tmp) {
+entry:
+ %tmp9 = call i32 @llvm.eh.sjlj.setjmp(i8* %tmp)
+ ret i32 %tmp9
+}
+
+declare i32 @llvm.eh.sjlj.setjmp(i8*)
diff --git a/test/CodeGen/X86/i386-shrink-wrapping.ll b/test/CodeGen/X86/i386-shrink-wrapping.ll
index 748c397143c5..2c3e384b70a6 100644
--- a/test/CodeGen/X86/i386-shrink-wrapping.ll
+++ b/test/CodeGen/X86/i386-shrink-wrapping.ll
@@ -1,7 +1,7 @@
-; RUN: llc %s -o - -enable-shrink-wrap=true | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
-; RUN: llc %s -o - -enable-shrink-wrap=false | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
+; RUN: llc %s -o - -enable-shrink-wrap=true -no-x86-call-frame-opt | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
+; RUN: llc %s -o - -enable-shrink-wrap=false -no-x86-call-frame-opt | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
-target triple = "i386-apple-macosx"
+target triple = "i386-apple-macosx10.5"
@a = common global i32 0, align 4
@d = internal unnamed_addr global i1 false
@@ -64,7 +64,7 @@ target triple = "i386-apple-macosx"
; CHECK-NEXT: cmovnel {{%[a-z]+}}, [[CONV]]
;
; Skip all the crust of vaarg lowering.
-; CHECK: calll L_varfunc$stub
+; CHECK: calll _varfunc
; Set the return value to 0.
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: addl $20, %esp
diff --git a/test/CodeGen/X86/i386-tlscall-fastregalloc.ll b/test/CodeGen/X86/i386-tlscall-fastregalloc.ll
index 775c0c1b3784..86f6f5872d0f 100644
--- a/test/CodeGen/X86/i386-tlscall-fastregalloc.ll
+++ b/test/CodeGen/X86/i386-tlscall-fastregalloc.ll
@@ -10,15 +10,20 @@ target triple = "i386-apple-macosx10.10"
; PR26485.
;
; CHECK-LABEL: f:
+; Get c.
+; C is spilled because of the scheduling of the instructions,
+; but a smarter regalloc wouldn't have spilled it.
+; CHECK: movl L_c{{[^,]*}}, [[C_ADDR:%[a-z]+]]
+; CHECK-NEXT: movl [[C_ADDR]], [[C_SPILLED:[0-8]+\(%esp\)]]
; Get p.
-; CHECK: movl _p@{{[0-9a-zA-Z]+}}, [[P_ADDR:%[a-z]+]]
+; CHECK-NEXT: movl _p@{{[0-9a-zA-Z]+}}, [[P_ADDR:%[a-z]+]]
; CHECK-NEXT: calll *([[P_ADDR]])
; At this point eax contains the address of p.
; Load c address.
; Make sure we do not clobber eax.
-; CHECK-NEXT: movl L_c{{[^,]*}}, [[C_ADDR:%e[b-z]x+]]
+; CHECK-NEXT: movl [[C_SPILLED]], [[C_ADDR_RELOADED:%e[b-z]x+]]
; Store c address into p.
-; CHECK-NEXT: movl [[C_ADDR]], (%eax)
+; CHECK-NEXT: movl [[C_ADDR_RELOADED]], (%eax)
define void @f() #0 {
entry:
store i8* @c, i8** @p, align 4
diff --git a/test/CodeGen/X86/i686-win-shrink-wrapping.ll b/test/CodeGen/X86/i686-win-shrink-wrapping.ll
new file mode 100644
index 000000000000..1a2cb8476623
--- /dev/null
+++ b/test/CodeGen/X86/i686-win-shrink-wrapping.ll
@@ -0,0 +1,44 @@
+; RUN: llc %s -o - -enable-shrink-wrap=true | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
+; RUN: llc %s -o - -enable-shrink-wrap=false | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc18.0.0"
+
+%struct.S = type { i32 }
+
+; Check that we do not use a basic block that has EFLAGS as live-in
+; if we need to realign the stack.
+; PR27531.
+; CHECK-LABEL: stackRealignment:
+; Prologue code.
+; CHECK: pushl
+; Make sure we actually perform some stack realignment.
+; CHECK: andl ${{[-0-9]+}}, %esp
+; This is the end of the entry block.
+; The prologue should have happened before that point because past
+; this point, EFLAGS is live.
+; CHECK: jg
+define x86_thiscallcc void @stackRealignment(%struct.S* %this) {
+entry:
+ %data = alloca [1 x i32], align 4
+ %d = alloca double, align 8
+ %tmp = bitcast [1 x i32]* %data to i8*
+ %arrayinit.begin = getelementptr inbounds [1 x i32], [1 x i32]* %data, i32 0, i32 0
+ %x_ = getelementptr inbounds %struct.S, %struct.S* %this, i32 0, i32 0
+ %tmp1 = load i32, i32* %x_, align 4
+ %cmp = icmp sgt i32 %tmp1, 32
+ %cond = select i1 %cmp, i32 42, i32 128
+ store i32 %cond, i32* %arrayinit.begin, align 4
+ %cmp3 = icmp slt i32 %tmp1, 32
+ br i1 %cmp3, label %cleanup, label %if.end
+
+if.end: ; preds = %entry
+ %tmp2 = bitcast double* %d to i8*
+ call x86_thiscallcc void @bar(%struct.S* nonnull %this, i32* %arrayinit.begin, double* nonnull %d)
+ br label %cleanup
+
+cleanup: ; preds = %if.end, %entry
+ ret void
+}
+
+; Function Attrs: optsize
+declare x86_thiscallcc void @bar(%struct.S*, i32*, double*)
diff --git a/test/CodeGen/X86/ifunc-asm.ll b/test/CodeGen/X86/ifunc-asm.ll
new file mode 100644
index 000000000000..b65ba86a4f1a
--- /dev/null
+++ b/test/CodeGen/X86/ifunc-asm.ll
@@ -0,0 +1,15 @@
+; RUN: llvm-as < %s -o - | llc -filetype=asm | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal i64 @foo_ifunc() {
+entry:
+ ret i64 0
+}
+; CHECK: .type foo_ifunc,@function
+; CHECK-NEXT: foo_ifunc:
+
+@foo = ifunc i32 (i32), i64 ()* @foo_ifunc
+; CHECK: .type foo,@function
+; CHECK-NEXT: .type foo,@gnu_indirect_function
+; CHECK-NEXT: foo = foo_ifunc
diff --git a/test/CodeGen/X86/implicit-null-check.ll b/test/CodeGen/X86/implicit-null-check.ll
index 8b905f5d23b6..9a8a3a4369d3 100644
--- a/test/CodeGen/X86/implicit-null-check.ll
+++ b/test/CodeGen/X86/implicit-null-check.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 -mtriple=x86_64-apple-macosx -enable-implicit-null-checks < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -O3 -mtriple=x86_64-apple-macosx -enable-implicit-null-checks < %s | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-implicit-null-checks \
; RUN: | llvm-mc -triple x86_64-apple-macosx -filetype=obj -o - \
@@ -12,10 +12,10 @@
define i32 @imp_null_check_load(i32* %x) {
; CHECK-LABEL: _imp_null_check_load:
-; CHECK: Ltmp1:
+; CHECK: [[BB0_imp_null_check_load:L[^:]+]]:
; CHECK: movl (%rdi), %eax
; CHECK: retq
-; CHECK: Ltmp0:
+; CHECK: [[BB1_imp_null_check_load:LBB0_[0-9]+]]:
; CHECK: movl $42, %eax
; CHECK: retq
@@ -33,10 +33,10 @@ define i32 @imp_null_check_load(i32* %x) {
define i32 @imp_null_check_gep_load(i32* %x) {
; CHECK-LABEL: _imp_null_check_gep_load:
-; CHECK: Ltmp3:
+; CHECK: [[BB0_imp_null_check_gep_load:L[^:]+]]:
; CHECK: movl 128(%rdi), %eax
; CHECK: retq
-; CHECK: Ltmp2:
+; CHECK: [[BB1_imp_null_check_gep_load:LBB1_[0-9]+]]:
; CHECK: movl $42, %eax
; CHECK: retq
@@ -55,11 +55,11 @@ define i32 @imp_null_check_gep_load(i32* %x) {
define i32 @imp_null_check_add_result(i32* %x, i32 %p) {
; CHECK-LABEL: _imp_null_check_add_result:
-; CHECK: Ltmp5:
+; CHECK: [[BB0_imp_null_check_add_result:L[^:]+]]:
; CHECK: addl (%rdi), %esi
; CHECK: movl %esi, %eax
; CHECK: retq
-; CHECK: Ltmp4:
+; CHECK: [[BB1_imp_null_check_add_result:LBB2_[0-9]+]]:
; CHECK: movl $42, %eax
; CHECK: retq
@@ -78,12 +78,12 @@ define i32 @imp_null_check_add_result(i32* %x, i32 %p) {
define i32 @imp_null_check_hoist_over_unrelated_load(i32* %x, i32* %y, i32* %z) {
; CHECK-LABEL: _imp_null_check_hoist_over_unrelated_load:
-; CHECK: Ltmp7:
+; CHECK: [[BB0_imp_null_check_hoist_over_unrelated_load:L[^:]+]]:
; CHECK: movl (%rdi), %eax
; CHECK: movl (%rsi), %ecx
; CHECK: movl %ecx, (%rdx)
; CHECK: retq
-; CHECK: Ltmp6:
+; CHECK: [[BB1_imp_null_check_hoist_over_unrelated_load:LBB3_[0-9]+]]:
; CHECK: movl $42, %eax
; CHECK: retq
@@ -103,12 +103,12 @@ define i32 @imp_null_check_hoist_over_unrelated_load(i32* %x, i32* %y, i32* %z)
define i32 @imp_null_check_via_mem_comparision(i32* %x, i32 %val) {
; CHECK-LABEL: _imp_null_check_via_mem_comparision
-; CHECK: Ltmp9:
+; CHECK: [[BB0_imp_null_check_via_mem_comparision:L[^:]+]]:
; CHECK: cmpl %esi, 4(%rdi)
; CHECK: jge LBB4_2
; CHECK: movl $100, %eax
; CHECK: retq
-; CHECK: Ltmp8:
+; CHECK: [[BB1_imp_null_check_via_mem_comparision:LBB4_[0-9]+]]:
; CHECK: movl $42, %eax
; CHECK: retq
; CHECK: LBB4_2:
@@ -158,9 +158,9 @@ define i32 @imp_null_check_via_mem_comparision(i32* %x, i32 %val) {
; Fault[0].Type:
; CHECK-NEXT: .long 1
; Fault[0].FaultOffset:
-; CHECK-NEXT: .long Ltmp5-_imp_null_check_add_result
+; CHECK-NEXT: .long [[BB0_imp_null_check_add_result]]-_imp_null_check_add_result
; Fault[0].HandlerOffset:
-; CHECK-NEXT: .long Ltmp4-_imp_null_check_add_result
+; CHECK-NEXT: .long [[BB1_imp_null_check_add_result]]-_imp_null_check_add_result
; FunctionAddr:
; CHECK-NEXT: .quad _imp_null_check_gep_load
@@ -171,9 +171,9 @@ define i32 @imp_null_check_via_mem_comparision(i32* %x, i32 %val) {
; Fault[0].Type:
; CHECK-NEXT: .long 1
; Fault[0].FaultOffset:
-; CHECK-NEXT: .long Ltmp3-_imp_null_check_gep_load
+; CHECK-NEXT: .long [[BB0_imp_null_check_gep_load]]-_imp_null_check_gep_load
; Fault[0].HandlerOffset:
-; CHECK-NEXT: .long Ltmp2-_imp_null_check_gep_load
+; CHECK-NEXT: .long [[BB1_imp_null_check_gep_load]]-_imp_null_check_gep_load
; FunctionAddr:
; CHECK-NEXT: .quad _imp_null_check_hoist_over_unrelated_load
@@ -184,9 +184,9 @@ define i32 @imp_null_check_via_mem_comparision(i32* %x, i32 %val) {
; Fault[0].Type:
; CHECK-NEXT: .long 1
; Fault[0].FaultOffset:
-; CHECK-NEXT: .long Ltmp7-_imp_null_check_hoist_over_unrelated_load
+; CHECK-NEXT: .long [[BB0_imp_null_check_hoist_over_unrelated_load]]-_imp_null_check_hoist_over_unrelated_load
; Fault[0].HandlerOffset:
-; CHECK-NEXT: .long Ltmp6-_imp_null_check_hoist_over_unrelated_load
+; CHECK-NEXT: .long [[BB1_imp_null_check_hoist_over_unrelated_load]]-_imp_null_check_hoist_over_unrelated_load
; FunctionAddr:
; CHECK-NEXT: .quad _imp_null_check_load
@@ -197,9 +197,9 @@ define i32 @imp_null_check_via_mem_comparision(i32* %x, i32 %val) {
; Fault[0].Type:
; CHECK-NEXT: .long 1
; Fault[0].FaultOffset:
-; CHECK-NEXT: .long Ltmp1-_imp_null_check_load
+; CHECK-NEXT: .long [[BB0_imp_null_check_load]]-_imp_null_check_load
; Fault[0].HandlerOffset:
-; CHECK-NEXT: .long Ltmp0-_imp_null_check_load
+; CHECK-NEXT: .long [[BB1_imp_null_check_load]]-_imp_null_check_load
; FunctionAddr:
; CHECK-NEXT: .quad _imp_null_check_via_mem_comparision
@@ -210,9 +210,9 @@ define i32 @imp_null_check_via_mem_comparision(i32* %x, i32 %val) {
; Fault[0].Type:
; CHECK-NEXT: .long 1
; Fault[0].FaultOffset:
-; CHECK-NEXT: .long Ltmp9-_imp_null_check_via_mem_comparision
+; CHECK-NEXT: .long [[BB0_imp_null_check_via_mem_comparision]]-_imp_null_check_via_mem_comparision
; Fault[0].HandlerOffset:
-; CHECK-NEXT: .long Ltmp8-_imp_null_check_via_mem_comparision
+; CHECK-NEXT: .long [[BB1_imp_null_check_via_mem_comparision]]-_imp_null_check_via_mem_comparision
; OBJDUMP: FaultMap table:
; OBJDUMP-NEXT: Version: 0x1
diff --git a/test/CodeGen/X86/implicit-null-checks.mir b/test/CodeGen/X86/implicit-null-checks.mir
new file mode 100644
index 000000000000..9e83964247e7
--- /dev/null
+++ b/test/CodeGen/X86/implicit-null-checks.mir
@@ -0,0 +1,266 @@
+# RUN: llc -run-pass implicit-null-checks -mtriple=x86_64-apple-macosx -o - %s | FileCheck %s
+
+--- |
+ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+ target triple = "x86_64-apple-macosx"
+
+ ;; Positive test
+ define i32 @imp_null_check_with_bitwise_op_0(i32* %x, i32 %val) {
+ entry:
+ br i1 undef, label %is_null, label %not_null, !make.implicit !0
+
+ is_null:
+ ret i32 42
+
+ not_null:
+ br i1 undef, label %ret_100, label %ret_200
+
+ ret_100:
+ ret i32 100
+
+ ret_200:
+ ret i32 200
+ }
+
+ ;; Negative test. The regalloc is such that we cannot hoist the
+ ;; instruction materializing 2200000 into %eax
+ define i32 @imp_null_check_with_bitwise_op_1(i32* %x, i32 %val, i32* %ptr) {
+ entry:
+ br i1 undef, label %is_null, label %not_null, !make.implicit !0
+
+ is_null:
+ ret i32 undef
+
+ not_null:
+ br i1 undef, label %ret_100, label %ret_200
+
+ ret_100:
+ ret i32 100
+
+ ret_200:
+ ret i32 200
+ }
+
+ ;; Negative test: IR is identical to
+ ;; @imp_null_check_with_bitwise_op_0 but MIR differs.
+ define i32 @imp_null_check_with_bitwise_op_2(i32* %x, i32 %val) {
+ entry:
+ br i1 undef, label %is_null, label %not_null, !make.implicit !0
+
+ is_null:
+ ret i32 42
+
+ not_null:
+ br i1 undef, label %ret_100, label %ret_200
+
+ ret_100:
+ ret i32 100
+
+ ret_200:
+ ret i32 200
+ }
+
+ ;; Negative test: IR is identical to
+ ;; @imp_null_check_with_bitwise_op_0 but MIR differs.
+ define i32 @imp_null_check_with_bitwise_op_3(i32* %x, i32 %val) {
+ entry:
+ br i1 undef, label %is_null, label %not_null, !make.implicit !0
+
+ is_null:
+ ret i32 42
+
+ not_null:
+ br i1 undef, label %ret_100, label %ret_200
+
+ ret_100:
+ ret i32 100
+
+ ret_200:
+ ret i32 200
+ }
+
+ !0 = !{}
+...
+---
+name: imp_null_check_with_bitwise_op_0
+# CHECK-LABEL: name: imp_null_check_with_bitwise_op_0
+alignment: 4
+allVRegsAllocated: true
+tracksRegLiveness: true
+tracksSubRegLiveness: false
+liveins:
+ - { reg: '%rdi' }
+ - { reg: '%esi' }
+# CHECK: bb.0.entry:
+# CHECK: %eax = MOV32ri 2200000
+# CHECK-NEXT: %eax = FAULTING_LOAD_OP %bb.3.is_null, {{[0-9]+}}, killed %eax, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+# CHECK-NEXT: JMP_1 %bb.1.not_null
+
+body: |
+ bb.0.entry:
+ successors: %bb.3.is_null, %bb.1.not_null
+ liveins: %esi, %rdi
+
+ TEST64rr %rdi, %rdi, implicit-def %eflags
+ JE_1 %bb.3.is_null, implicit %eflags
+
+ bb.1.not_null:
+ successors: %bb.4.ret_100, %bb.2.ret_200
+ liveins: %esi, %rdi
+
+ %eax = MOV32ri 2200000
+ %eax = AND32rm killed %eax, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+ CMP32rr killed %eax, killed %esi, implicit-def %eflags
+ JE_1 %bb.4.ret_100, implicit %eflags
+
+ bb.2.ret_200:
+ %eax = MOV32ri 200
+ RET 0, %eax
+
+ bb.3.is_null:
+ %eax = MOV32ri 42
+ RET 0, %eax
+
+ bb.4.ret_100:
+ %eax = MOV32ri 100
+ RET 0, %eax
+
+...
+---
+name: imp_null_check_with_bitwise_op_1
+alignment: 4
+allVRegsAllocated: true
+isSSA: false
+tracksRegLiveness: true
+tracksSubRegLiveness: false
+liveins:
+ - { reg: '%rdi' }
+ - { reg: '%esi' }
+ - { reg: '%rdx' }
+# CHECK: bb.0.entry:
+# CHECK: %eax = MOV32rm killed %rdx, 1, _, 0, _ :: (volatile load 4 from %ir.ptr)
+# CHECK-NEXT: TEST64rr %rdi, %rdi, implicit-def %eflags
+# CHECK-NEXT: JE_1 %bb.3.is_null, implicit %eflags
+
+body: |
+ bb.0.entry:
+ successors: %bb.3.is_null, %bb.1.not_null
+ liveins: %esi, %rdi, %rdx
+
+ %eax = MOV32rm killed %rdx, 1, _, 0, _ :: (volatile load 4 from %ir.ptr)
+ TEST64rr %rdi, %rdi, implicit-def %eflags
+ JE_1 %bb.3.is_null, implicit %eflags
+
+ bb.1.not_null:
+ successors: %bb.4.ret_100, %bb.2.ret_200
+ liveins: %esi, %rdi
+
+ %eax = MOV32ri 2200000
+ %eax = AND32rm killed %eax, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+ CMP32rr killed %eax, killed %esi, implicit-def %eflags
+ JE_1 %bb.4.ret_100, implicit %eflags
+
+ bb.2.ret_200:
+ successors: %bb.3.is_null
+
+ %eax = MOV32ri 200
+
+ bb.3.is_null:
+ liveins: %eax, %ah, %al, %ax, %bh, %bl, %bp, %bpl, %bx, %eax, %ebp, %ebx, %rax, %rbp, %rbx, %r12, %r13, %r14, %r15, %r12b, %r13b, %r14b, %r15b, %r12d, %r13d, %r14d, %r15d, %r12w, %r13w, %r14w, %r15w
+
+ RET 0, %eax
+
+ bb.4.ret_100:
+ %eax = MOV32ri 100
+ RET 0, %eax
+
+...
+---
+name: imp_null_check_with_bitwise_op_2
+# CHECK-LABEL: name: imp_null_check_with_bitwise_op_2
+alignment: 4
+allVRegsAllocated: true
+tracksRegLiveness: true
+tracksSubRegLiveness: false
+liveins:
+ - { reg: '%rdi' }
+ - { reg: '%esi' }
+# CHECK: bb.0.entry:
+# CHECK: TEST64rr %rdi, %rdi, implicit-def %eflags
+# CHECK-NEXT: JE_1 %bb.3.is_null, implicit %eflags
+
+body: |
+ bb.0.entry:
+ successors: %bb.3.is_null, %bb.1.not_null
+ liveins: %esi, %rdi
+
+ TEST64rr %rdi, %rdi, implicit-def %eflags
+ JE_1 %bb.3.is_null, implicit %eflags
+
+ bb.1.not_null:
+ successors: %bb.4.ret_100, %bb.2.ret_200
+ liveins: %esi, %rdi
+
+ %eax = MOV32ri 2200000
+ %eax = ADD32ri killed %eax, 100, implicit-def dead %eflags
+ %eax = AND32rm killed %eax, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+ CMP32rr killed %eax, killed %esi, implicit-def %eflags
+ JE_1 %bb.4.ret_100, implicit %eflags
+
+ bb.2.ret_200:
+ %eax = MOV32ri 200
+ RET 0, %eax
+
+ bb.3.is_null:
+ %eax = MOV32ri 42
+ RET 0, %eax
+
+ bb.4.ret_100:
+ %eax = MOV32ri 100
+ RET 0, %eax
+
+...
+---
+name: imp_null_check_with_bitwise_op_3
+# CHECK-LABEL: name: imp_null_check_with_bitwise_op_3
+alignment: 4
+allVRegsAllocated: true
+tracksRegLiveness: true
+tracksSubRegLiveness: false
+liveins:
+ - { reg: '%rdi' }
+ - { reg: '%rsi' }
+# CHECK: bb.0.entry:
+# CHECK: TEST64rr %rdi, %rdi, implicit-def %eflags
+# CHECK-NEXT: JE_1 %bb.3.is_null, implicit %eflags
+
+body: |
+ bb.0.entry:
+ successors: %bb.3.is_null, %bb.1.not_null
+ liveins: %rsi, %rdi
+
+ TEST64rr %rdi, %rdi, implicit-def %eflags
+ JE_1 %bb.3.is_null, implicit %eflags
+
+ bb.1.not_null:
+ successors: %bb.4.ret_100, %bb.2.ret_200
+ liveins: %rsi, %rdi
+
+ %rdi = MOV64ri 5000
+ %rdi = AND64rm killed %rdi, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+ CMP64rr killed %rdi, killed %rsi, implicit-def %eflags
+ JE_1 %bb.4.ret_100, implicit %eflags
+
+ bb.2.ret_200:
+ %eax = MOV32ri 200
+ RET 0, %eax
+
+ bb.3.is_null:
+ %eax = MOV32ri 42
+ RET 0, %eax
+
+ bb.4.ret_100:
+ %eax = MOV32ri 100
+ RET 0, %eax
+
+...
diff --git a/test/CodeGen/X86/inalloca-ctor.ll b/test/CodeGen/X86/inalloca-ctor.ll
index eba4e72f9330..f13d537d90b8 100644
--- a/test/CodeGen/X86/inalloca-ctor.ll
+++ b/test/CodeGen/X86/inalloca-ctor.ll
@@ -12,8 +12,8 @@ define void @g() {
entry:
%args = alloca inalloca %frame
%c = getelementptr %frame, %frame* %args, i32 0, i32 2
-; CHECK: movl $20, %eax
-; CHECK: calll __chkstk
+; CHECK: pushl %eax
+; CHECK: subl $16, %esp
; CHECK: movl %esp,
call void @Foo_ctor(%Foo* %c)
; CHECK: leal 12(%{{.*}}),
diff --git a/test/CodeGen/X86/inalloca-invoke.ll b/test/CodeGen/X86/inalloca-invoke.ll
index 9a184e563b19..d90e5012ba45 100644
--- a/test/CodeGen/X86/inalloca-invoke.ll
+++ b/test/CodeGen/X86/inalloca-invoke.ll
@@ -21,7 +21,8 @@ blah:
%beg = getelementptr %frame.reverse, %frame.reverse* %rev_args, i32 0, i32 0
%end = getelementptr %frame.reverse, %frame.reverse* %rev_args, i32 0, i32 1
-; CHECK: calll __chkstk
+; CHECK: pushl %eax
+; CHECK: subl $20, %esp
; CHECK: movl %esp, %[[beg:[^ ]*]]
; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]]
diff --git a/test/CodeGen/X86/inalloca-stdcall.ll b/test/CodeGen/X86/inalloca-stdcall.ll
index 4f7e4092a99c..69d94d8bfa74 100644
--- a/test/CodeGen/X86/inalloca-stdcall.ll
+++ b/test/CodeGen/X86/inalloca-stdcall.ll
@@ -8,8 +8,8 @@ declare x86_stdcallcc void @i(i32 %a)
define void @g() {
; CHECK-LABEL: _g:
%b = alloca inalloca %Foo
-; CHECK: movl $8, %eax
-; CHECK: calll __chkstk
+; CHECK: pushl %eax
+; CHECK: pushl %eax
%f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
%f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
store i32 13, i32* %f1
diff --git a/test/CodeGen/X86/inalloca.ll b/test/CodeGen/X86/inalloca.ll
index e523c945a69f..134de2f58dda 100644
--- a/test/CodeGen/X86/inalloca.ll
+++ b/test/CodeGen/X86/inalloca.ll
@@ -8,8 +8,8 @@ define void @a() {
; CHECK-LABEL: _a:
entry:
%b = alloca inalloca %Foo
-; CHECK: movl $8, %eax
-; CHECK: calll __chkstk
+; CHECK: pushl %eax
+; CHECK: pushl %eax
%f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
%f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
store i32 13, i32* %f1
@@ -28,8 +28,8 @@ define void @b() {
; CHECK-LABEL: _b:
entry:
%b = alloca inalloca %Foo
-; CHECK: movl $8, %eax
-; CHECK: calll __chkstk
+; CHECK: pushl %eax
+; CHECK: pushl %eax
%f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
%f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
store i32 13, i32* %f1
@@ -49,8 +49,8 @@ define void @c() {
; CHECK-LABEL: _c:
entry:
%b = alloca inalloca %Foo
-; CHECK: movl $8, %eax
-; CHECK: calll __chkstk
+; CHECK: pushl %eax
+; CHECK: pushl %eax
%f1 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
%f2 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
store i32 13, i32* %f1
diff --git a/test/CodeGen/X86/indirect-hidden.ll b/test/CodeGen/X86/indirect-hidden.ll
index 9e1b7d373554..5f3885d00e5f 100644
--- a/test/CodeGen/X86/indirect-hidden.ll
+++ b/test/CodeGen/X86/indirect-hidden.ll
@@ -35,9 +35,9 @@ declare i32 @__gxx_personality_v0(...)
; CHECK: .section __IMPORT,__pointers,non_lazy_symbol_pointers
; CHECK-NOT: __DATA,__data
-; CHECK: .indirect_symbol _normal_typeid
+; CHECK: .indirect_symbol _hidden_typeid
; CHECK-NEXT: .long 0
; CHECK-NOT: __DATA,__data
-; CHECK: .indirect_symbol _hidden_typeid
+; CHECK: .indirect_symbol _normal_typeid
; CHECK-NEXT: .long 0
diff --git a/test/CodeGen/X86/insertelement-zero.ll b/test/CodeGen/X86/insertelement-zero.ll
index 4e582de22a1f..65c1c0957adf 100644
--- a/test/CodeGen/X86/insertelement-zero.ll
+++ b/test/CodeGen/X86/insertelement-zero.ll
@@ -10,37 +10,72 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
define <2 x double> @insert_v2f64_z1(<2 x double> %a) {
-; SSE-LABEL: insert_v2f64_z1:
-; SSE: # BB#0:
-; SSE-NEXT: xorpd %xmm1, %xmm1
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE-NEXT: retq
+; SSE2-LABEL: insert_v2f64_z1:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorpd %xmm1, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v2f64_z1:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorpd %xmm1, %xmm1
+; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v2f64_z1:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorpd %xmm1, %xmm1
+; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v2f64_z1:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorpd %xmm1, %xmm1
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v2f64_z1:
; AVX: # BB#0:
; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
%1 = insertelement <2 x double> %a, double 0.0, i32 0
ret <2 x double> %1
}
define <4 x double> @insert_v4f64_0zz3(<4 x double> %a) {
-; SSE-LABEL: insert_v4f64_0zz3:
-; SSE: # BB#0:
-; SSE-NEXT: xorpd %xmm2, %xmm2
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; SSE-NEXT: retq
+; SSE2-LABEL: insert_v4f64_0zz3:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorpd %xmm2, %xmm2
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v4f64_0zz3:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorpd %xmm2, %xmm2
+; SSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v4f64_0zz3:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorpd %xmm2, %xmm2
+; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v4f64_0zz3:
+; SSE41: # BB#0:
+; SSE41-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE41-NEXT: xorpd %xmm2, %xmm2
+; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v4f64_0zz3:
; AVX: # BB#0:
-; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
; AVX-NEXT: retq
%1 = insertelement <4 x double> %a, double 0.0, i32 1
%2 = insertelement <4 x double> %1, double 0.0, i32 2
@@ -68,15 +103,21 @@ define <2 x i64> @insert_v2i64_z1(<2 x i64> %a) {
;
; SSE41-LABEL: insert_v2i64_z1:
; SSE41: # BB#0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: pinsrq $0, %rax, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX-LABEL: insert_v2i64_z1:
-; AVX: # BB#0:
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: vpinsrq $0, %rax, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: insert_v2i64_z1:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_v2i64_z1:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: retq
%1 = insertelement <2 x i64> %a, i64 0, i32 0
ret <2 x i64> %1
}
@@ -102,24 +143,20 @@ define <4 x i64> @insert_v4i64_01z3(<4 x i64> %a) {
;
; SSE41-LABEL: insert_v4i64_01z3:
; SSE41: # BB#0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: pinsrq $0, %rax, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_v4i64_01z3:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: xorl %eax, %eax
-; AVX1-NEXT: vpinsrq $0, %rax, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_v4i64_01z3:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: vpinsrq $0, %rax, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: retq
%1 = insertelement <4 x i64> %a, i64 0, i32 2
ret <4 x i64> %1
@@ -150,13 +187,13 @@ define <4 x float> @insert_v4f32_01z3(<4 x float> %a) {
; SSE41-LABEL: insert_v4f32_01z3:
; SSE41: # BB#0:
; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v4f32_01z3:
; AVX: # BB#0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX-NEXT: retq
%1 = insertelement <4 x float> %a, float 0.0, i32 2
ret <4 x float> %1
@@ -191,16 +228,13 @@ define <8 x float> @insert_v8f32_z12345z7(<8 x float> %a) {
; SSE41: # BB#0:
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v8f32_z12345z7:
; AVX: # BB#0:
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7]
; AVX-NEXT: retq
%1 = insertelement <8 x float> %a, float 0.0, i32 0
%2 = insertelement <8 x float> %1, float 0.0, i32 6
@@ -234,15 +268,21 @@ define <4 x i32> @insert_v4i32_01z3(<4 x i32> %a) {
;
; SSE41-LABEL: insert_v4i32_01z3:
; SSE41: # BB#0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: pinsrd $2, %eax, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
-; AVX-LABEL: insert_v4i32_01z3:
-; AVX: # BB#0:
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: insert_v4i32_01z3:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_v4i32_01z3:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2-NEXT: retq
%1 = insertelement <4 x i32> %a, i32 0, i32 2
ret <4 x i32> %1
}
@@ -280,29 +320,21 @@ define <8 x i32> @insert_v8i32_z12345z7(<8 x i32> %a) {
;
; SSE41-LABEL: insert_v8i32_z12345z7:
; SSE41: # BB#0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: pinsrd $0, %eax, %xmm0
-; SSE41-NEXT: pinsrd $2, %eax, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_v8i32_z12345z7:
; AVX1: # BB#0:
-; AVX1-NEXT: xorl %eax, %eax
-; AVX1-NEXT: vpinsrd $0, %eax, %xmm0, %xmm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_v8i32_z12345z7:
; AVX2: # BB#0:
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7]
; AVX2-NEXT: retq
%1 = insertelement <8 x i32> %a, i32 0, i32 0
%2 = insertelement <8 x i32> %1, i32 0, i32 6
@@ -310,18 +342,37 @@ define <8 x i32> @insert_v8i32_z12345z7(<8 x i32> %a) {
}
define <8 x i16> @insert_v8i16_z12345z7(<8 x i16> %a) {
-; SSE-LABEL: insert_v8i16_z12345z7:
-; SSE: # BB#0:
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: pinsrw $0, %eax, %xmm0
-; SSE-NEXT: pinsrw $6, %eax, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: insert_v8i16_z12345z7:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: pinsrw $0, %eax, %xmm0
+; SSE2-NEXT: pinsrw $6, %eax, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v8i16_z12345z7:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorl %eax, %eax
+; SSE3-NEXT: pinsrw $0, %eax, %xmm0
+; SSE3-NEXT: pinsrw $6, %eax, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v8i16_z12345z7:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorl %eax, %eax
+; SSSE3-NEXT: pinsrw $0, %eax, %xmm0
+; SSSE3-NEXT: pinsrw $6, %eax, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v8i16_z12345z7:
+; SSE41: # BB#0:
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7]
+; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v8i16_z12345z7:
; AVX: # BB#0:
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7]
; AVX-NEXT: retq
%1 = insertelement <8 x i16> %a, i16 0, i32 0
%2 = insertelement <8 x i16> %1, i16 0, i32 6
@@ -329,35 +380,58 @@ define <8 x i16> @insert_v8i16_z12345z7(<8 x i16> %a) {
}
define <16 x i16> @insert_v16i16_z12345z789ABZDEz(<16 x i16> %a) {
-; SSE-LABEL: insert_v16i16_z12345z789ABZDEz:
-; SSE: # BB#0:
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: pinsrw $0, %eax, %xmm0
-; SSE-NEXT: pinsrw $6, %eax, %xmm0
-; SSE-NEXT: pinsrw $7, %eax, %xmm1
-; SSE-NEXT: retq
+; SSE2-LABEL: insert_v16i16_z12345z789ABZDEz:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: pinsrw $0, %eax, %xmm0
+; SSE2-NEXT: pinsrw $6, %eax, %xmm0
+; SSE2-NEXT: pinsrw $7, %eax, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v16i16_z12345z789ABZDEz:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorl %eax, %eax
+; SSE3-NEXT: pinsrw $0, %eax, %xmm0
+; SSE3-NEXT: pinsrw $6, %eax, %xmm0
+; SSE3-NEXT: pinsrw $7, %eax, %xmm1
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v16i16_z12345z789ABZDEz:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorl %eax, %eax
+; SSSE3-NEXT: pinsrw $0, %eax, %xmm0
+; SSSE3-NEXT: pinsrw $6, %eax, %xmm0
+; SSSE3-NEXT: pinsrw $7, %eax, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v16i16_z12345z789ABZDEz:
+; SSE41: # BB#0:
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4,5],xmm2[6],xmm0[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
+; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_v16i16_z12345z789ABZDEz:
; AVX1: # BB#0:
-; AVX1-NEXT: xorl %eax, %eax
-; AVX1-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_v16i16_z12345z789ABZDEz:
; AVX2: # BB#0:
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%1 = insertelement <16 x i16> %a, i16 0, i32 0
diff --git a/test/CodeGen/X86/insertps-combine.ll b/test/CodeGen/X86/insertps-combine.ll
index f2596b6347b9..b21fdec624bc 100644
--- a/test/CodeGen/X86/insertps-combine.ll
+++ b/test/CodeGen/X86/insertps-combine.ll
@@ -6,16 +6,12 @@
define <4 x float> @shuffle_v4f32_0z27(<4 x float> %x, <4 x float> %a) {
; SSE-LABEL: shuffle_v4f32_0z27:
; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm2, %xmm2
-; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0z27:
; AVX: # BB#0:
-; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
@@ -28,16 +24,12 @@ define <4 x float> @shuffle_v4f32_0z27(<4 x float> %x, <4 x float> %a) {
define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %xyzw, <4 x float> %abcd) {
; SSE-LABEL: shuffle_v4f32_0zz4:
; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm2, %xmm2
-; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0zz4:
; AVX: # BB#0:
-; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %xyzw, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
@@ -50,16 +42,12 @@ define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %xyzw, <4 x float> %abcd) {
define <4 x float> @shuffle_v4f32_0z24(<4 x float> %xyzw, <4 x float> %abcd) {
; SSE-LABEL: shuffle_v4f32_0z24:
; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm2, %xmm2
-; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0z24:
; AVX: # BB#0:
-; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %xyzw, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
@@ -72,17 +60,12 @@ define <4 x float> @shuffle_v4f32_0z24(<4 x float> %xyzw, <4 x float> %abcd) {
define <4 x float> @shuffle_v4f32_0zz0(float %a) {
; SSE-LABEL: shuffle_v4f32_0zz0:
; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,0]
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0zz0:
; AVX: # BB#0:
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; AVX-NEXT: retq
%vecinit = insertelement <4 x float> undef, float %a, i32 0
%vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
@@ -110,6 +93,132 @@ define <4 x float> @shuffle_v4f32_0z6z(<4 x float> %A, <4 x float> %B) {
ret <4 x float> %vecinit4
}
+define <4 x float> @insertps_undef_input0(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: insertps_undef_input0:
+; SSE: # BB#0:
+; SSE-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],zero,zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insertps_undef_input0:
+; AVX: # BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],zero,zero
+; AVX-NEXT: retq
+ %res0 = fadd <4 x float> %a0, <float 1.0, float 1.0, float 1.0, float 1.0>
+ %res1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %res0, <4 x float> %a1, i8 21)
+ %res2 = shufflevector <4 x float> %res1, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+ ret <4 x float> %res2
+}
+
+define <4 x float> @insertps_undef_input1(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: insertps_undef_input1:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insertps_undef_input1:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX-NEXT: retq
+ %res0 = fadd <4 x float> %a1, <float 1.0, float 1.0, float 1.0, float 1.0>
+ %res1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %res0, i8 21)
+ %res2 = shufflevector <4 x float> %res1, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+ ret <4 x float> %res2
+}
+
+define <4 x float> @insertps_zero_from_v2f64(<4 x float> %a0, <2 x double>* %a1) nounwind {
+; SSE-LABEL: insertps_zero_from_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movapd (%rdi), %xmm1
+; SSE-NEXT: addpd {{.*}}(%rip), %xmm1
+; SSE-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
+; SSE-NEXT: movapd %xmm1, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insertps_zero_from_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovapd (%rdi), %xmm1
+; AVX-NEXT: vaddpd {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
+; AVX-NEXT: vmovapd %xmm1, (%rdi)
+; AVX-NEXT: retq
+ %1 = load <2 x double>, <2 x double>* %a1
+ %2 = bitcast <2 x double> <double 1.0, double 2.0> to <4 x float>
+ %3 = fadd <2 x double> %1, <double 1.0, double 2.0>
+ %4 = shufflevector <4 x float> %a0, <4 x float> %2, <4 x i32> <i32 6, i32 2, i32 2, i32 3>
+ store <2 x double> %3, <2 x double> *%a1
+ ret <4 x float> %4
+}
+
+define <4 x float> @insertps_zero_from_v2i64(<4 x float> %a0, <2 x i64>* %a1) nounwind {
+; SSE-LABEL: insertps_zero_from_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
+; SSE-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
+; SSE-NEXT: movdqa %xmm1, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insertps_zero_from_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm1
+; AVX-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
+; AVX-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX-NEXT: retq
+ %1 = load <2 x i64>, <2 x i64>* %a1
+ %2 = bitcast <2 x i64> <i64 1, i64 -2> to <4 x float>
+ %3 = add <2 x i64> %1, <i64 1, i64 -2>
+ %4 = shufflevector <4 x float> %a0, <4 x float> %2, <4 x i32> <i32 5, i32 2, i32 2, i32 3>
+ store <2 x i64> %3, <2 x i64> *%a1
+ ret <4 x float> %4
+}
+
+define <4 x float> @insertps_zero_from_v8i16(<4 x float> %a0, <8 x i16>* %a1) nounwind {
+; SSE-LABEL: insertps_zero_from_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: paddw {{.*}}(%rip), %xmm1
+; SSE-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
+; SSE-NEXT: movdqa %xmm1, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insertps_zero_from_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm1
+; AVX-NEXT: vpaddw {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
+; AVX-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %a1
+ %2 = bitcast <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 2, i16 2, i16 3, i16 3> to <4 x float>
+ %3 = add <8 x i16> %1, <i16 0, i16 0, i16 1, i16 1, i16 2, i16 2, i16 3, i16 3>
+ %4 = shufflevector <4 x float> %a0, <4 x float> %2, <4 x i32> <i32 4, i32 2, i32 2, i32 3>
+ store <8 x i16> %3, <8 x i16> *%a1
+ ret <4 x float> %4
+}
+
+define <4 x float> @consecutive_load_insertps_04zz(float* %p) {
+; SSE-LABEL: consecutive_load_insertps_04zz:
+; SSE: # BB#0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: consecutive_load_insertps_04zz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+ %p0 = getelementptr inbounds float, float* %p, i64 1
+ %p1 = getelementptr inbounds float, float* %p, i64 2
+ %s0 = load float, float* %p0
+ %s1 = load float, float* %p1
+ %v0 = insertelement <4 x float> undef, float %s0, i32 0
+ %v1 = insertelement <4 x float> undef, float %s1, i32 0
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v0, <4 x float> %v1, i8 28)
+ ret <4 x float> %res
+}
+
define float @extract_zero_insertps_z0z7(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: extract_zero_insertps_z0z7:
; SSE: # BB#0:
diff --git a/test/CodeGen/X86/interval-update-remat.ll b/test/CodeGen/X86/interval-update-remat.ll
new file mode 100644
index 000000000000..4e80e34c9479
--- /dev/null
+++ b/test/CodeGen/X86/interval-update-remat.ll
@@ -0,0 +1,161 @@
+; RUN: llc -verify-regalloc -verify-machineinstrs < %s
+; PR27275: When enabling remat for a vreg defined by PHIs, make sure the update
+; of the live range removes dead PHIs. Otherwise, we may end up with PHIs that
+; have incorrect operands, which will trigger assertions or verifier failures
+; in later passes.
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+@b = external global i64, align 8
+@d = external global i32, align 4
+@e = external global i64, align 8
+@h = external global i16, align 2
+@a = external global i8, align 1
+@g = external global i64, align 8
+@j = external global i32, align 4
+@f = external global i16, align 2
+@.str = external unnamed_addr constant [12 x i8], align 1
+
+define void @fn1() {
+entry:
+ %tmp = load i64, i64* @b, align 8
+ %or = or i64 0, 3299921317
+ %and = and i64 %or, %tmp
+ %tmp1 = load i32, i32* @d, align 4
+ br i1 undef, label %lor.rhs, label %lor.end
+
+lor.rhs: ; preds = %entry
+ %tobool3 = icmp ne i8 undef, 0
+ br label %lor.end
+
+lor.end: ; preds = %lor.rhs, %entry
+ %lor.ext = zext i1 undef to i32
+ %tmp2 = load i64, i64* @e, align 8
+ br i1 undef, label %lor.rhs5, label %lor.end7
+
+lor.rhs5: ; preds = %lor.end
+ br label %lor.end7
+
+lor.end7: ; preds = %lor.rhs5, %lor.end
+ %tmp3 = phi i1 [ true, %lor.end ], [ false, %lor.rhs5 ]
+ %neg13 = xor i64 %tmp, -1
+ %conv25 = zext i1 %tmp3 to i32
+ %tobool46 = icmp eq i64 %tmp, 0
+ %.pre = load i16, i16* @h, align 2
+ %tobool10 = icmp eq i16 %.pre, 0
+ %neg.us = xor i32 %tmp1, -1
+ %conv12.us = sext i32 %neg.us to i64
+ %tobool23.us = icmp eq i64 %tmp2, %and
+ %conv39.us = sext i32 %tmp1 to i64
+ br label %LABEL_mSmSDb
+
+LABEL_mSmSDb.loopexit: ; preds = %lor.end32.us
+ %conv42.us.lcssa = phi i32 [ %conv42.us, %lor.end32.us ]
+ store i64 undef, i64* @g, align 8
+ br label %LABEL_mSmSDb
+
+LABEL_mSmSDb: ; preds = %LABEL_mSmSDb.loopexit, %lor.end7
+ %tmp4 = phi i32 [ undef, %lor.end7 ], [ %conv42.us.lcssa, %LABEL_mSmSDb.loopexit ]
+ %tmp5 = phi i64 [ %tmp, %lor.end7 ], [ 0, %LABEL_mSmSDb.loopexit ]
+ br i1 %tobool10, label %LABEL_BRBRN.preheader, label %if.then
+
+if.then: ; preds = %LABEL_mSmSDb
+ store i8 undef, i8* @a, align 1
+ br label %LABEL_BRBRN.preheader
+
+LABEL_BRBRN.preheader: ; preds = %if.then, %LABEL_mSmSDb
+ %.pre63 = load i64, i64* @g, align 8
+ br i1 %tobool46, label %LABEL_BRBRN.us, label %LABEL_BRBRN.outer
+
+LABEL_BRBRN.outer: ; preds = %if.then47, %LABEL_BRBRN.preheader
+ %.ph = phi i32 [ 0, %if.then47 ], [ %tmp4, %LABEL_BRBRN.preheader ]
+ %.ph64 = phi i32 [ %conv50, %if.then47 ], [ %tmp1, %LABEL_BRBRN.preheader ]
+ %.ph65 = phi i64 [ %tmp16, %if.then47 ], [ %.pre63, %LABEL_BRBRN.preheader ]
+ %.ph66 = phi i64 [ 0, %if.then47 ], [ %tmp2, %LABEL_BRBRN.preheader ]
+ %.ph67 = phi i64 [ %.pre56.pre, %if.then47 ], [ %tmp5, %LABEL_BRBRN.preheader ]
+ %neg = xor i32 %.ph64, -1
+ %conv12 = sext i32 %neg to i64
+ %tobool23 = icmp eq i64 %.ph66, %and
+ %tmp6 = load i32, i32* @j, align 4
+ %shr = lshr i32 %conv25, %tmp6
+ %conv39 = sext i32 %.ph64 to i64
+ br label %LABEL_BRBRN
+
+LABEL_BRBRN.us: ; preds = %lor.end32.us, %LABEL_BRBRN.preheader
+ %tmp7 = phi i32 [ %conv42.us, %lor.end32.us ], [ %tmp4, %LABEL_BRBRN.preheader ]
+ %tmp8 = phi i64 [ undef, %lor.end32.us ], [ %.pre63, %LABEL_BRBRN.preheader ]
+ %tmp9 = phi i64 [ %tmp10, %lor.end32.us ], [ %tmp5, %LABEL_BRBRN.preheader ]
+ %mul.us = mul i64 %tmp8, %neg13
+ %mul14.us = mul i64 %mul.us, %conv12.us
+ %cmp.us = icmp sgt i64 %tmp2, %mul14.us
+ %conv16.us = zext i1 %cmp.us to i64
+ %xor.us = xor i64 %conv16.us, %tmp9
+ %rem18.us = urem i32 %lor.ext, %tmp7
+ %conv19.us = zext i32 %rem18.us to i64
+ br i1 %tobool23.us, label %lor.rhs24.us, label %lor.end32.us
+
+lor.rhs24.us: ; preds = %LABEL_BRBRN.us
+ br label %lor.end32.us
+
+lor.end32.us: ; preds = %lor.rhs24.us, %LABEL_BRBRN.us
+ %tmp10 = phi i64 [ -2, %LABEL_BRBRN.us ], [ -1, %lor.rhs24.us ]
+ %xor.us.not = xor i64 %xor.us, -1
+ %neg36.us = and i64 %conv19.us, %xor.us.not
+ %conv37.us = zext i32 %tmp7 to i64
+ %sub38.us = sub nsw i64 %neg36.us, %conv37.us
+ %mul40.us = mul nsw i64 %sub38.us, %conv39.us
+ %neg41.us = xor i64 %mul40.us, 4294967295
+ %conv42.us = trunc i64 %neg41.us to i32
+ %tobool43.us = icmp eq i8 undef, 0
+ br i1 %tobool43.us, label %LABEL_mSmSDb.loopexit, label %LABEL_BRBRN.us
+
+LABEL_BRBRN: ; preds = %lor.end32, %LABEL_BRBRN.outer
+ %tmp11 = phi i32 [ %conv42, %lor.end32 ], [ %.ph, %LABEL_BRBRN.outer ]
+ %tmp12 = phi i64 [ %neg21, %lor.end32 ], [ %.ph65, %LABEL_BRBRN.outer ]
+ %tmp13 = phi i64 [ %conv35, %lor.end32 ], [ %.ph67, %LABEL_BRBRN.outer ]
+ %mul = mul i64 %tmp12, %neg13
+ %mul14 = mul i64 %mul, %conv12
+ %cmp = icmp sgt i64 %.ph66, %mul14
+ %conv16 = zext i1 %cmp to i64
+ %xor = xor i64 %conv16, %tmp13
+ %rem18 = urem i32 %lor.ext, %tmp11
+ %conv19 = zext i32 %rem18 to i64
+ %neg21 = or i64 %xor, undef
+ br i1 %tobool23, label %lor.rhs24, label %lor.end32
+
+lor.rhs24: ; preds = %LABEL_BRBRN
+ %tmp14 = load volatile i16, i16* @f, align 2
+ %conv26 = sext i16 %tmp14 to i32
+ %and27 = and i32 %conv26, %shr
+ %conv28 = sext i32 %and27 to i64
+ %mul29 = mul nsw i64 %conv28, %tmp
+ %and30 = and i64 %mul29, %tmp13
+ %tobool31 = icmp ne i64 %and30, 0
+ br label %lor.end32
+
+lor.end32: ; preds = %lor.rhs24, %LABEL_BRBRN
+ %tmp15 = phi i1 [ true, %LABEL_BRBRN ], [ %tobool31, %lor.rhs24 ]
+ %lor.ext33 = zext i1 %tmp15 to i32
+ %neg34 = xor i32 %lor.ext33, -1
+ %conv35 = sext i32 %neg34 to i64
+ %xor.not = xor i64 %xor, -1
+ %neg36 = and i64 %conv19, %xor.not
+ %conv37 = zext i32 %tmp11 to i64
+ %sub38 = sub nsw i64 %neg36, %conv37
+ %mul40 = mul nsw i64 %sub38, %conv39
+ %neg41 = xor i64 %mul40, 4294967295
+ %conv42 = trunc i64 %neg41 to i32
+ %tobool43 = icmp eq i8 undef, 0
+ br i1 %tobool43, label %if.then47, label %LABEL_BRBRN
+
+if.then47: ; preds = %lor.end32
+ tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str, i32 0, i32 0), i64 %conv39)
+ %tmp16 = load i64, i64* @g, align 8
+ %neg49 = xor i64 %tmp16, 4294967295
+ %conv50 = trunc i64 %neg49 to i32
+ %.pre56.pre = load i64, i64* @b, align 8
+ br label %LABEL_BRBRN.outer
+}
+
+declare void @printf(i8* nocapture readonly, ...)
diff --git a/test/CodeGen/X86/ipra-inline-asm.ll b/test/CodeGen/X86/ipra-inline-asm.ll
new file mode 100644
index 000000000000..e70b149e19e1
--- /dev/null
+++ b/test/CodeGen/X86/ipra-inline-asm.ll
@@ -0,0 +1,20 @@
+; RUN: llc -enable-ipra -print-regusage -o /dev/null 2>&1 < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12.0"
+
+; Verify that bar does not clobber anything
+; CHECK-NOT: bar Clobbered Registers:{{.+}}
+; CHECK: bar Clobbered Registers:
+define void @bar() #0 {
+ ret void
+}
+
+; Verifies that inline assembly is correctly handled by giving a list of clobbered registers
+; CHECK: foo Clobbered Registers: AH AL AX CH CL CX DI DIL EAX ECX EDI RAX RCX RDI
+define void @foo() #0 {
+ call void asm sideeffect "", "~{eax},~{ecx},~{edi}"() #0
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/ipra-local-linkage.ll b/test/CodeGen/X86/ipra-local-linkage.ll
new file mode 100644
index 000000000000..a394ed3e3858
--- /dev/null
+++ b/test/CodeGen/X86/ipra-local-linkage.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s | FileCheck %s -check-prefix=NOIPRA
+; RUN: llc -enable-ipra < %s | FileCheck %s
+
+target triple = "x86_64--"
+
+define internal void @foo() norecurse {
+; When IPRA is not enabled, R15 will be saved by foo as it is a callee-saved reg.
+; NOIPRA-LABEL: foo:
+; NOIPRA: pushq %r15
+; When IPRA is enabled, no register should be saved, as foo() is a local
+; function, so we optimize it to save no registers.
+; CHECK-LABEL: foo:
+; CHECK-NOT: pushq %r15
+ call void asm sideeffect "movl %r14d, %r15d", "~{r15}"()
+ ret void
+}
+
+define void @bar(i32 %X) {
+ call void asm sideeffect "movl %r12d, $0", "{r15}~{r12}"(i32 %X)
+  ; As R15 is clobbered by foo() when IPRA is enabled, the value of R15 should be
+  ; saved if the register containing the original value is also clobbered, and
+  ; reloaded after foo(); here the original value is loaded back into R15D after
+  ; the call to foo.
+ call void @foo()
+ ; CHECK-LABEL: bar:
+ ; CHECK: callq foo
+ ; CHECK-NEXT: movl %eax, %r15d
+ call void asm sideeffect "movl $0, %r12d", "{r15}~{r12}"(i32 %X)
+ ret void
+}
diff --git a/test/CodeGen/X86/ipra-reg-usage.ll b/test/CodeGen/X86/ipra-reg-usage.ll
new file mode 100644
index 000000000000..ca97472bb820
--- /dev/null
+++ b/test/CodeGen/X86/ipra-reg-usage.ll
@@ -0,0 +1,12 @@
+; RUN: llc -enable-ipra -print-regusage -o /dev/null 2>&1 < %s | FileCheck %s
+
+target triple = "x86_64-unknown-unknown"
+declare void @bar1()
+define preserve_allcc void @foo()#0 {
+; CHECK: foo Clobbered Registers: CS DS EFLAGS EIP EIZ ES FPSW FS GS IP RIP RIZ SS BND0 BND1 BND2 BND3 CR0 CR1 CR2 CR3 CR4 CR5 CR6 CR7 CR8 CR9 CR10 CR11 CR12 CR13 CR14 CR15 DR0 DR1 DR2 DR3 DR4 DR5 DR6 DR7 DR8 DR9 DR10 DR11 DR12 DR13 DR14 DR15 FP0 FP1 FP2 FP3 FP4 FP5 FP6 FP7 K0 K1 K2 K3 K4 K5 K6 K7 MM0 MM1 MM2 MM3 MM4 MM5 MM6 MM7 R11 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7 XMM16 XMM17 XMM18 XMM19 XMM20 XMM21 XMM22 XMM23 XMM24 XMM25 XMM26 XMM27 XMM28 XMM29 XMM30 XMM31 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15 YMM16 YMM17 YMM18 YMM19 YMM20 YMM21 YMM22 YMM23 YMM24 YMM25 YMM26 YMM27 YMM28 YMM29 YMM30 YMM31 ZMM0 ZMM1 ZMM2 ZMM3 ZMM4 ZMM5 ZMM6 ZMM7 ZMM8 ZMM9 ZMM10 ZMM11 ZMM12 ZMM13 ZMM14 ZMM15 ZMM16 ZMM17 ZMM18 ZMM19 ZMM20 ZMM21 ZMM22 ZMM23 ZMM24 ZMM25 ZMM26 ZMM27 ZMM28 ZMM29 ZMM30 ZMM31 R11B R11D R11W
+ call void @bar1()
+ call void @bar2()
+ ret void
+}
+declare void @bar2()
+attributes #0 = {nounwind}
diff --git a/test/CodeGen/X86/ipra-transform.ll b/test/CodeGen/X86/ipra-transform.ll
new file mode 100644
index 000000000000..362af8812346
--- /dev/null
+++ b/test/CodeGen/X86/ipra-transform.ll
@@ -0,0 +1,32 @@
+
+; RUN: llc < %s | FileCheck %s -check-prefix=NOIPRA
+; RUN: llc -enable-ipra < %s | FileCheck %s
+
+
+target triple = "x86_64-unknown-unknown"
+define void @bar1() {
+ ret void
+}
+define preserve_allcc void @foo()#0 {
+; Due to preserve_allcc, foo() will save some registers at its start;
+; the NOIPRA prefix will verify that.
+; NOIPRA-LABEL: foo:
+; NOIPRA: pushq %r10
+; NOIPRA-NEXT: pushq %r9
+; NOIPRA-NEXT: pushq %r8
+; NOIPRA: callq bar1
+; When IPRA is present, the registers above will not be saved, and that is
+; verified by the CHECK prefix.
+; CHECK: foo:
+; CHECK-NOT: pushq %r10
+; CHECK-NOT: pushq %r9
+; CHECK-NOT: pushq %r8
+; CHECK: callq bar1
+ call void @bar1()
+ call void @bar2()
+ ret void
+}
+define void @bar2() {
+ ret void
+}
+attributes #0 = {nounwind}
diff --git a/test/CodeGen/X86/lakemont.ll b/test/CodeGen/X86/lakemont.ll
new file mode 100644
index 000000000000..ddd24525f27a
--- /dev/null
+++ b/test/CodeGen/X86/lakemont.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=x86 -mcpu=lakemont | FileCheck %s
+
+; Make sure -mcpu=lakemont implies soft floats.
+define float @test(float %a, float %b) nounwind readnone {
+; CHECK-LABEL: test:
+; CHECK: __addsf3
+ %add = fadd float %a, %b
+ ret float %add
+}
diff --git a/test/CodeGen/X86/lea-opt-memop-check-1.ll b/test/CodeGen/X86/lea-opt-memop-check-1.ll
new file mode 100644
index 000000000000..08e510772a88
--- /dev/null
+++ b/test/CodeGen/X86/lea-opt-memop-check-1.ll
@@ -0,0 +1,99 @@
+; RUN: llc < %s -march=x86 -mtriple=i686-pc-win32 | FileCheck %s
+
+; PR26575
+; Assertion `(Disp->isImm() || Disp->isGlobal()) && (Other.Disp->isImm() || Other.Disp->isGlobal()) && "Address displacement operand is always an immediate or a global"' failed.
+
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) argmemonly nounwind
+declare <2 x i64> @_mm_xor_si128(<2 x i64>, <2 x i64>) optsize
+declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+declare <4 x float> @_mm_castsi128_ps(<2 x i64>) optsize
+
+; Check that the LEA optimization pass works with CPI address displacements.
+define void @test1(i8* nocapture readonly %src, i32 %len) #0 {
+ %parts = alloca [4 x i32], align 4
+ %part0 = bitcast [4 x i32]* %parts to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %part0, i8* %src, i32 %len, i32 1, i1 false)
+ %call0 = tail call <2 x i64> @_mm_xor_si128(<2 x i64> undef, <2 x i64> <i64 -9187201950435737472, i64 -9187201950435737472>)
+ %tmp0 = tail call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> undef, <2 x i64> <i64 7631803798, i64 5708721108>, i8 16)
+ %call1 = tail call <4 x float> @_mm_castsi128_ps(<2 x i64> %tmp0)
+ ret void
+; CHECK-LABEL: test1:
+; CHECK: leal{{.*}}
+; CHECK: calll _memcpy
+; CHECK: movaps __xmm@{{[0-9a-f]+}}, %xmm1
+; CHECK: calll __mm_xor_si128
+; CHECK: pclmulqdq $16, __xmm@{{[0-9a-f]+}}, %xmm0
+; CHECK: jmp __mm_castsi128_ps
+}
+
+declare i32 @GetLastError(...)
+declare void @IsolationAwareDeactivateActCtx(i32, i32)
+declare i8* @llvm.localaddress()
+declare void @llvm.localescape(...)
+declare i8* @llvm.localrecover(i8*, i8*, i32)
+
+@IsolationAwarePrivateT_SqbjaYRiRY = common global i32 0, align 4
+
+; Check that the MCSymbol objects are created to be used in "\01?fin$0@0@test2@@".
+define void @test2() #0 {
+entry:
+ %fActivateActCtxSuccess = alloca i32, align 4
+ %proc = alloca i32, align 4
+ %ulpCookie = alloca i32, align 4
+ call void (...) @llvm.localescape(i32* nonnull %fActivateActCtxSuccess, i32* nonnull %proc, i32* nonnull %ulpCookie)
+ %tmp0 = tail call i8* @llvm.localaddress()
+ call fastcc void @"\01?fin$0@0@test2@@"(i8* %tmp0)
+ ret void
+; CHECK-LABEL: test2:
+; CHECK: Ltest2$frame_escape_0 = 8
+; CHECK: Ltest2$frame_escape_1 = 4
+; CHECK: Ltest2$frame_escape_2 = 0
+; CHECK: calll "?fin$0@0@test2@@"
+}
+
+; Check that the LEA optimization pass works with MCSymbol address displacements.
+define internal fastcc void @"\01?fin$0@0@test2@@"(i8* readonly %frame_pointer) unnamed_addr noinline nounwind optsize {
+entry:
+ %tmp0 = tail call i8* @llvm.localrecover(i8* bitcast (void ()* @test2 to i8*), i8* %frame_pointer, i32 1)
+ %proc = bitcast i8* %tmp0 to i32*
+ %tmp1 = tail call i8* @llvm.localrecover(i8* bitcast (void ()* @test2 to i8*), i8* %frame_pointer, i32 2)
+ %ulpCookie = bitcast i8* %tmp1 to i32*
+ %tmp2 = load i32, i32* @IsolationAwarePrivateT_SqbjaYRiRY, align 4
+ %tobool = icmp eq i32 %tmp2, 0
+ br i1 %tobool, label %if.end, label %land.lhs.true
+
+land.lhs.true:
+ %tmp3 = tail call i8* @llvm.localrecover(i8* bitcast (void ()* @test2 to i8*), i8* %frame_pointer, i32 0)
+ %fActivateActCtxSuccess = bitcast i8* %tmp3 to i32*
+ %tmp4 = load i32, i32* %fActivateActCtxSuccess, align 4
+ %tobool1 = icmp eq i32 %tmp4, 0
+ br i1 %tobool1, label %if.end, label %if.then
+
+if.then:
+ %tmp5 = load i32, i32* %proc, align 4
+ %tobool2 = icmp eq i32 %tmp5, 0
+ br i1 %tobool2, label %cond.end, label %cond.true
+
+cond.true:
+ %call = tail call i32 bitcast (i32 (...)* @GetLastError to i32 ()*)()
+ br label %cond.end
+
+cond.end:
+ %tmp6 = load i32, i32* %ulpCookie, align 4
+ tail call void @IsolationAwareDeactivateActCtx(i32 0, i32 %tmp6)
+ br label %if.end
+
+if.end:
+ ret void
+; CHECK-LABEL: "?fin$0@0@test2@@":
+; CHECK: cmpl $0, Ltest2$frame_escape_0([[REG1:%[a-z]+]])
+; CHECK: leal Ltest2$frame_escape_1([[REG1]]), [[REG2:%[a-z]+]]
+; CHECK: leal Ltest2$frame_escape_2([[REG1]]), [[REG3:%[a-z]+]]
+; CHECK: cmpl $0, ([[REG2]])
+; CHECK: pushl ([[REG3]])
+}
+
+attributes #0 = { nounwind optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/X86/lea-opt-memop-check-2.ll b/test/CodeGen/X86/lea-opt-memop-check-2.ll
new file mode 100644
index 000000000000..f3fc95f8be3c
--- /dev/null
+++ b/test/CodeGen/X86/lea-opt-memop-check-2.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7 -relocation-model=pic | FileCheck %s
+
+; PR27502
+; UNREACHABLE: "Invalid address displacement operand"
+
+@buf = internal global [5 x i8*] zeroinitializer
+
+declare i32 @llvm.eh.sjlj.setjmp(i8*) nounwind
+
+define i32 @test() nounwind optsize {
+ %r = tail call i32 @llvm.eh.sjlj.setjmp(i8* bitcast ([5 x i8*]* @buf to i8*))
+ ret i32 %r
+; CHECK-LABEL: test:
+; CHECK: leaq .LBB0_3(%rip), %r[[REG:[a-z]+]]
+; CHECK: movq %r[[REG]], buf+8(%rip)
+; CHECK: #EH_SjLj_Setup .LBB0_3
+; CHECK: xorl %e[[REG]], %e[[REG]]
+; CHECK: jmp .LBB0_2
+; CHECK-LABEL: .LBB0_3: # Block address taken
+; CHECK-LABEL: .LBB0_2:
+}
diff --git a/test/CodeGen/X86/lea-opt.ll b/test/CodeGen/X86/lea-opt.ll
index 8096bfabd6cf..9e0e34b1e09a 100644
--- a/test/CodeGen/X86/lea-opt.ll
+++ b/test/CodeGen/X86/lea-opt.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux -enable-x86-lea-opt | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=CHECK -check-prefix=ENABLED
+; RUN: llc --disable-x86-lea-opt < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=CHECK -check-prefix=DISABLED
%struct.anon1 = type { i32, i32, i32 }
%struct.anon2 = type { i32, [32 x i32], i32 }
@@ -34,16 +35,18 @@ sw.bb.2: ; preds = %entry
sw.epilog: ; preds = %sw.bb.2, %sw.bb.1, %entry
ret void
; CHECK-LABEL: test1:
-; CHECK: leaq (%rdi,%rdi,2), [[REG1:%[a-z]+]]
-; CHECK: movl arr1(,[[REG1]],4), {{.*}}
-; CHECK: leaq arr1+4(,[[REG1]],4), [[REG2:%[a-z]+]]
-; CHECK: subl arr1+4(,[[REG1]],4), {{.*}}
-; CHECK: leaq arr1+8(,[[REG1]],4), [[REG3:%[a-z]+]]
-; CHECK: addl arr1+8(,[[REG1]],4), {{.*}}
+; CHECK: shlq $2, [[REG1:%[a-z]+]]
+; CHECK: movl arr1([[REG1]],[[REG1]],2), {{.*}}
+; CHECK: leaq arr1+4([[REG1]],[[REG1]],2), [[REG2:%[a-z]+]]
+; CHECK: subl arr1+4([[REG1]],[[REG1]],2), {{.*}}
+; DISABLED: leaq arr1+8([[REG1]],[[REG1]],2), [[REG3:%[a-z]+]]
+; CHECK: addl arr1+8([[REG1]],[[REG1]],2), {{.*}}
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
-; CHECK: movl ${{[1-4]+}}, ([[REG3]])
+; ENABLED: movl ${{[1-4]+}}, 4([[REG2]])
+; DISABLED: movl ${{[1-4]+}}, ([[REG3]])
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
-; CHECK: movl ${{[1-4]+}}, ([[REG3]])
+; ENABLED: movl ${{[1-4]+}}, 4([[REG2]])
+; DISABLED: movl ${{[1-4]+}}, ([[REG3]])
}
define void @test2(i64 %x) nounwind optsize {
@@ -74,16 +77,21 @@ sw.bb.2: ; preds = %entry
sw.epilog: ; preds = %sw.bb.2, %sw.bb.1, %entry
ret void
; CHECK-LABEL: test2:
-; CHECK: leaq (%rdi,%rdi,2), [[REG1:%[a-z]+]]
-; CHECK: leaq arr1+4(,[[REG1]],4), [[REG2:%[a-z]+]]
-; CHECK: movl -4([[REG2]]), {{.*}}
-; CHECK: subl ([[REG2]]), {{.*}}
-; CHECK: leaq arr1+8(,[[REG1]],4), [[REG3:%[a-z]+]]
-; CHECK: addl ([[REG3]]), {{.*}}
+; CHECK: shlq $2, [[REG1:%[a-z]+]]
+; DISABLED: movl arr1([[REG1]],[[REG1]],2), {{.*}}
+; CHECK: leaq arr1+4([[REG1]],[[REG1]],2), [[REG2:%[a-z]+]]
+; ENABLED: movl -4([[REG2]]), {{.*}}
+; ENABLED: subl ([[REG2]]), {{.*}}
+; ENABLED: addl 4([[REG2]]), {{.*}}
+; DISABLED: subl arr1+4([[REG1]],[[REG1]],2), {{.*}}
+; DISABLED: leaq arr1+8([[REG1]],[[REG1]],2), [[REG3:%[a-z]+]]
+; DISABLED: addl arr1+8([[REG1]],[[REG1]],2), {{.*}}
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
-; CHECK: movl ${{[1-4]+}}, ([[REG3]])
+; ENABLED: movl ${{[1-4]+}}, 4([[REG2]])
+; DISABLED: movl ${{[1-4]+}}, ([[REG3]])
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
-; CHECK: movl ${{[1-4]+}}, ([[REG3]])
+; ENABLED: movl ${{[1-4]+}}, 4([[REG2]])
+; DISABLED: movl ${{[1-4]+}}, ([[REG3]])
}
; Check that the LEA optimization pass takes into account a resultant address
@@ -109,7 +117,9 @@ sw.bb.1: ; preds = %entry
sw.bb.2: ; preds = %entry
store i32 333, i32* %a, align 4
- store i32 444, i32* %b, align 4
+  ; Make sure REG3's defining LEA won't be removed as redundant.
+ %cvt = ptrtoint i32* %b to i32
+ store i32 %cvt, i32* %b, align 4
br label %sw.epilog
sw.epilog: ; preds = %sw.bb.2, %sw.bb.1, %entry
@@ -122,12 +132,14 @@ sw.epilog: ; preds = %sw.bb.2, %sw.bb.1,
; REG3's definition is closer to the movl than REG2's, but the pass still chooses
; REG2 because it yields a resultant address displacement that fits in 1 byte.
-; CHECK: movl ([[REG2]]), {{.*}}
-; CHECK: addl ([[REG3]]), {{.*}}
+; ENABLED: movl ([[REG2]]), {{.*}}
+; ENABLED: addl ([[REG3]]), {{.*}}
+; DISABLED: movl arr2+132([[REG1]]), {{.*}}
+; DISABLED: addl arr2([[REG1]]), {{.*}}
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
; CHECK: movl ${{[1-4]+}}, ([[REG3]])
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
-; CHECK: movl ${{[1-4]+}}, ([[REG3]])
+; CHECK: movl {{.*}}, ([[REG3]])
}
define void @test4(i64 %x) nounwind minsize {
@@ -158,12 +170,19 @@ sw.bb.2: ; preds = %entry
sw.epilog: ; preds = %sw.bb.2, %sw.bb.1, %entry
ret void
; CHECK-LABEL: test4:
-; CHECK: leaq arr1+4({{.*}}), [[REG2:%[a-z]+]]
-; CHECK: movl -4([[REG2]]), {{.*}}
-; CHECK: subl ([[REG2]]), {{.*}}
-; CHECK: addl 4([[REG2]]), {{.*}}
+; CHECK: imulq {{.*}}, [[REG1:%[a-z]+]]
+; DISABLED: movl arr1([[REG1]]), {{.*}}
+; CHECK: leaq arr1+4([[REG1]]), [[REG2:%[a-z]+]]
+; ENABLED: movl -4([[REG2]]), {{.*}}
+; ENABLED: subl ([[REG2]]), {{.*}}
+; ENABLED: addl 4([[REG2]]), {{.*}}
+; DISABLED: subl arr1+4([[REG1]]), {{.*}}
+; DISABLED: leaq arr1+8([[REG1]]), [[REG3:%[a-z]+]]
+; DISABLED: addl arr1+8([[REG1]]), {{.*}}
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
-; CHECK: movl ${{[1-4]+}}, 4([[REG2]])
+; ENABLED: movl ${{[1-4]+}}, 4([[REG2]])
+; DISABLED: movl ${{[1-4]+}}, ([[REG3]])
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
-; CHECK: movl ${{[1-4]+}}, 4([[REG2]])
+; ENABLED: movl ${{[1-4]+}}, 4([[REG2]])
+; DISABLED: movl ${{[1-4]+}}, ([[REG3]])
}
diff --git a/test/CodeGen/X86/libcall-sret.ll b/test/CodeGen/X86/libcall-sret.ll
index 67b99ac239cd..4ef0a78ad798 100644
--- a/test/CodeGen/X86/libcall-sret.ll
+++ b/test/CodeGen/X86/libcall-sret.ll
@@ -10,14 +10,25 @@ define void @test_sret_libcall(i128 %l, i128 %r) {
; CHECK-LABEL: test_sret_libcall:
; Stack for call: 4(sret ptr), 16(i128 %l), 16(i128 %r). So next logical
- ; (aligned) place for the actual sret data is %esp + 40.
-; CHECK: leal 40(%esp), [[SRET_ADDR:%[a-z]+]]
-; CHECK: movl [[SRET_ADDR]], (%esp)
+ ; (aligned) place for the actual sret data is %esp + 20.
+; CHECK: leal 20(%esp), [[SRET_ADDR:%[a-z]+]]
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl 72(%esp)
+; CHECK: pushl [[SRET_ADDR]]
+
; CHECK: calll __multi3
-; CHECK-DAG: movl 40(%esp), [[RES0:%[a-z]+]]
-; CHECK-DAG: movl 44(%esp), [[RES1:%[a-z]+]]
-; CHECK-DAG: movl 48(%esp), [[RES2:%[a-z]+]]
-; CHECK-DAG: movl 52(%esp), [[RES3:%[a-z]+]]
+
+; CHECK: addl $44, %esp
+; CHECK-DAG: movl 8(%esp), [[RES0:%[a-z]+]]
+; CHECK-DAG: movl 12(%esp), [[RES1:%[a-z]+]]
+; CHECK-DAG: movl 16(%esp), [[RES2:%[a-z]+]]
+; CHECK-DAG: movl 20(%esp), [[RES3:%[a-z]+]]
; CHECK-DAG: movl [[RES0]], var
; CHECK-DAG: movl [[RES1]], var+4
; CHECK-DAG: movl [[RES2]], var+8
diff --git a/test/CodeGen/X86/licm-dominance.ll b/test/CodeGen/X86/licm-dominance.ll
index 7e3c6fdf9514..f6f563c9bcb6 100644
--- a/test/CodeGen/X86/licm-dominance.ll
+++ b/test/CodeGen/X86/licm-dominance.ll
@@ -1,36 +1,55 @@
; RUN: llc -asm-verbose=true < %s | FileCheck %s
; MachineLICM should check dominance before hoisting instructions.
+; Only the load of a0 is guaranteed to execute, so only it can be hoisted.
+; CHECK: movb (%rdi), [[a0reg:%[a-z0-9]+]]
+; CHECK: ## %for.body.i
+; CHECK: testb [[a0reg]], [[a0reg]]
; CHECK: ## in Loop:
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
+; CHECK: cmpb $1, ({{%[a-z0-9]+}})
+; CHECK: cmpb $2, ({{%[a-z0-9]+}})
+; CHECK: cmpb $3, ({{%[a-z0-9]+}})
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-macosx10.7.2"
-define void @CMSColorWorldCreateParametricData() nounwind uwtable optsize ssp {
+define void @CMSColorWorldCreateParametricData(
+ i8* dereferenceable(1) %a0,
+ i8* dereferenceable(1) %a1,
+ i8* dereferenceable(1) %a2,
+ i8* dereferenceable(1) %a3,
+ i64 %count) nounwind uwtable optsize ssp readonly {
entry:
br label %for.body.i
-for.body.i:
- br i1 undef, label %for.inc.i, label %if.then26.i
-
-if.then26.i:
- br i1 undef, label %if.else.i.i, label %lor.lhs.false.i.i
-
-if.else.i.i:
- br i1 undef, label %lor.lhs.false.i.i, label %if.then116.i.i
-
-lor.lhs.false.i.i:
- br i1 undef, label %for.inc.i, label %if.then116.i.i
-
-if.then116.i.i:
- unreachable
-
-for.inc.i:
- %cmp17.i = icmp ult i64 undef, undef
+for.body.i:
+ %i = phi i64 [0, %entry], [%i.inc, %for.inc.i]
+ %0 = load i8, i8* %a0, !invariant.load !0
+ %cond0 = icmp eq i8 %0, 0
+ br i1 %cond0, label %for.inc.i, label %if.then26.i
+
+if.then26.i:
+ %1 = load i8, i8* %a1, !invariant.load !0
+ %cond1 = icmp eq i8 %1, 1
+ br i1 %cond1, label %if.else.i.i, label %lor.lhs.false.i.i
+
+if.else.i.i:
+ %2 = load i8, i8* %a2, !invariant.load !0
+ %cond2 = icmp eq i8 %2, 2
+ br i1 %cond2, label %lor.lhs.false.i.i, label %for.inc.i
+
+lor.lhs.false.i.i:
+ %3 = load i8, i8* %a3, !invariant.load !0
+ %cond3 = icmp eq i8 %3, 3
+ br i1 %cond3, label %for.inc.i, label %if.end28.i
+
+for.inc.i:
+ %i.inc = add nsw i64 %i, 1
+ %cmp17.i = icmp ult i64 %i.inc, %count
br i1 %cmp17.i, label %for.body.i, label %if.end28.i
-if.end28.i:
+if.end28.i:
ret void
}
+
+!0 = !{}
diff --git a/test/CodeGen/X86/licm-symbol.ll b/test/CodeGen/X86/licm-symbol.ll
index 0f115ddbb6c2..050289e27c90 100644
--- a/test/CodeGen/X86/licm-symbol.ll
+++ b/test/CodeGen/X86/licm-symbol.ll
@@ -6,7 +6,7 @@
; CHECK: pushl
; CHECK: movl $176, %esi
; CHECK: addl L___sF$non_lazy_ptr, %esi
-; CHECK: .align 4, 0x90
+; CHECK: .p2align 4, 0x90
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
target triple = "i386-apple-darwin8"
diff --git a/test/CodeGen/X86/loc-remat.ll b/test/CodeGen/X86/loc-remat.ll
new file mode 100644
index 000000000000..d91ba4b99267
--- /dev/null
+++ b/test/CodeGen/X86/loc-remat.ll
@@ -0,0 +1,55 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@x = common global i32 0, align 4
+
+define i32 @main() !dbg !4 {
+entry:
+ %0 = load volatile i32, i32* @x, align 4, !dbg !9, !tbaa !10
+ %add = add nsw i32 %0, 24, !dbg !9
+ store volatile i32 %add, i32* @x, align 4, !dbg !9, !tbaa !10
+ %1 = load volatile i32, i32* @x, align 4, !dbg !14, !tbaa !10
+ %add1 = add nsw i32 %1, 2, !dbg !14
+ store volatile i32 %add1, i32* @x, align 4, !dbg !14, !tbaa !10
+ %2 = load volatile i32, i32* @x, align 4, !dbg !15, !tbaa !10
+ %add2 = add nsw i32 %2, 3, !dbg !15
+ store volatile i32 %add2, i32* @x, align 4, !dbg !15, !tbaa !10
+ %3 = load volatile i32, i32* @x, align 4, !dbg !16, !tbaa !10
+ %add3 = add nsw i32 %3, 4, !dbg !16
+ store volatile i32 %add3, i32* @x, align 4, !dbg !16, !tbaa !10
+ tail call void @exit(i32 24), !dbg !17
+ unreachable, !dbg !17
+}
+
+; CHECK-LABEL: main:
+; CHECK: .loc 1 3
+; CHECK: .loc 1 4
+; CHECK: .loc 1 5
+; CHECK: .loc 1 6
+; CHECK: .loc 1 7
+; CHECK: .loc 1 8
+; CHECK-NEXT: movl $24, %edi
+; CHECK-NEXT: callq exit
+
+declare void @exit(i32)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!6, !7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 259383) (llvm/trunk 259385)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "t.c", directory: "/home/majnemer/llvm/src")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: true, unit: !0, variables: !2)
+!5 = !DISubroutineType(types: !2)
+!6 = !{i32 2, !"Dwarf Version", i32 4}
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !DILocation(line: 4, column: 5, scope: !4)
+!10 = !{!11, !11, i64 0}
+!11 = !{!"int", !12, i64 0}
+!12 = !{!"omnipotent char", !13, i64 0}
+!13 = !{!"Simple C/C++ TBAA"}
+!14 = !DILocation(line: 5, column: 5, scope: !4)
+!15 = !DILocation(line: 6, column: 5, scope: !4)
+!16 = !DILocation(line: 7, column: 5, scope: !4)
+!17 = !DILocation(line: 8, column: 3, scope: !4)
diff --git a/test/CodeGen/X86/local_stack_symbol_ordering.ll b/test/CodeGen/X86/local_stack_symbol_ordering.ll
new file mode 100644
index 000000000000..998c14565ce1
--- /dev/null
+++ b/test/CodeGen/X86/local_stack_symbol_ordering.ll
@@ -0,0 +1,184 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s -check-prefix=X32
+
+; CHECK-LABEL: foo
+
+; Check the functionality of the local stack symbol table ordering
+; heuristics.
+; The test has a bunch of locals of various sizes that are referenced a
+; different number of times.
+;
+; a : 120B, 9 uses, density = 0.075
+; aa : 4000B, 1 use, density = 0.00025
+; b : 4B, 1 use, density = 0.25
+; cc : 4000B, 2 uses, density = 0.0005
+; d : 4B, 2 uses, density = 0.5
+; e : 4B, 3 uses, density = 0.75
+; f : 4B, 4 uses, density = 1
+;
+; Given the size, number of uses and calculated density (uses / size), we're
+; going to hope that f gets allocated closest to the stack pointer,
+; followed by e, d, b, then a (to check for just a few).
+; We use gnu-inline asm between calls to prevent registerization of addresses
+; so that we get exact counts.
+;
+; The test is taken from something like this:
+; void foo()
+; {
+; int f; // 4 uses. 4 / 4 = 1
+;   int a[30]; // 9 uses. 9 / 120 = 0.075
+;   int aa[1000]; // 1 use. 1 / 4000 = 0.00025
+; int e; // 3 uses. 3 / 4 = 0.75
+;   int cc[1000]; // 2 uses. 2 / 4000 = 0.0005
+; int b; // 1 use. 1 / 4 = 0.25
+; int d; // 2 uses. 2 / 4 = 0.5
+;   int aaa[1000]; // 2 uses. 2 / 4000 = 0.0005
+;
+;
+; check_a(&a);
+; bar1(&aaa);
+; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+; bar1(&a);
+; check_f(&f);
+; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+; bar1(&a);
+; bar3(&aa, &aaa, &cc);
+; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+; bar2(&a,&cc);
+; check_b(&b);
+; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+; bar1(&a);
+; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+; bar2(&a, &f);
+; check_e(&e);
+; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+; bar1(&a);
+; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+; bar2(&e, &f);
+; check_d(&d);
+; bar1(&a);
+; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+; bar3(&d, &e, &f);
+; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+; bar1(&a);
+; }
+;
+; X64: leaq 16(%rsp), %rdi
+; X64: callq check_a
+; X64: callq bar1
+; X64: callq bar1
+; X64: leaq (%rsp), %rdi
+; X64: callq check_f
+; X64: callq bar1
+; X64: callq bar3
+; X64: callq bar2
+; X64: leaq 12(%rsp), %rdi
+; X64: callq check_b
+; X64: callq bar1
+; X64: callq bar2
+; X64: leaq 4(%rsp), %rdi
+; X64: callq check_e
+; X64: callq bar1
+; X64: callq bar2
+; X64: leaq 8(%rsp), %rdi
+; X64: callq check_d
+
+; X32: leal 32(%esp)
+; X32: calll check_a
+; X32: calll bar1
+; X32: calll bar1
+; X32: leal 16(%esp)
+; X32: calll check_f
+; X32: calll bar1
+; X32: calll bar3
+; X32: calll bar2
+; X32: leal 28(%esp)
+; X32: calll check_b
+; X32: calll bar1
+; X32: calll bar2
+; X32: leal 20(%esp)
+; X32: calll check_e
+; X32: calll bar1
+; X32: calll bar2
+; X32: leal 24(%esp)
+; X32: calll check_d
+
+
+define void @foo() nounwind uwtable {
+entry:
+ %f = alloca i32, align 4
+ %a = alloca [30 x i32], align 16
+ %aa = alloca [1000 x i32], align 16
+ %e = alloca i32, align 4
+ %cc = alloca [1000 x i32], align 16
+ %b = alloca i32, align 4
+ %d = alloca i32, align 4
+ %aaa = alloca [1000 x i32], align 16
+ %0 = bitcast i32* %f to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %0) #1
+ %1 = bitcast [30 x i32]* %a to i8*
+ call void @llvm.lifetime.start(i64 120, i8* %1) #1
+ %2 = bitcast [1000 x i32]* %aa to i8*
+ call void @llvm.lifetime.start(i64 4000, i8* %2) #1
+ %3 = bitcast i32* %e to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %3) #1
+ %4 = bitcast [1000 x i32]* %cc to i8*
+ call void @llvm.lifetime.start(i64 4000, i8* %4) #1
+ %5 = bitcast i32* %b to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %5) #1
+ %6 = bitcast i32* %d to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %6) #1
+ %7 = bitcast [1000 x i32]* %aaa to i8*
+ call void @llvm.lifetime.start(i64 4000, i8* %7) #1
+ %call = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @check_a to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+ %call1 = call i32 ([1000 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([1000 x i32]*, ...)*)([1000 x i32]* %aaa)
+ call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+ %call2 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+ %call3 = call i32 (i32*, ...) bitcast (i32 (...)* @check_f to i32 (i32*, ...)*)(i32* %f)
+ call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+ %call4 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+ %call5 = call i32 ([1000 x i32]*, [1000 x i32]*, [1000 x i32]*, ...) bitcast (i32 (...)* @bar3 to i32 ([1000 x i32]*, [1000 x i32]*, [1000 x i32]*, ...)*)([1000 x i32]* %aa, [1000 x i32]* %aaa, [1000 x i32]* %cc)
+ call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+ %call6 = call i32 ([30 x i32]*, [1000 x i32]*, ...) bitcast (i32 (...)* @bar2 to i32 ([30 x i32]*, [1000 x i32]*, ...)*)([30 x i32]* %a, [1000 x i32]* %cc)
+ %call7 = call i32 (i32*, ...) bitcast (i32 (...)* @check_b to i32 (i32*, ...)*)(i32* %b)
+ call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+ %call8 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+ call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+ %call9 = call i32 ([30 x i32]*, i32*, ...) bitcast (i32 (...)* @bar2 to i32 ([30 x i32]*, i32*, ...)*)([30 x i32]* %a, i32* %f)
+ %call10 = call i32 (i32*, ...) bitcast (i32 (...)* @check_e to i32 (i32*, ...)*)(i32* %e)
+ call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+ %call11 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+ call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+ %call12 = call i32 (i32*, i32*, ...) bitcast (i32 (...)* @bar2 to i32 (i32*, i32*, ...)*)(i32* %e, i32* %f)
+ %call13 = call i32 (i32*, ...) bitcast (i32 (...)* @check_d to i32 (i32*, ...)*)(i32* %d)
+ %call14 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+ call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+ %call15 = call i32 (i32*, i32*, i32*, ...) bitcast (i32 (...)* @bar3 to i32 (i32*, i32*, i32*, ...)*)(i32* %d, i32* %e, i32* %f)
+ call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+ %call16 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+ call void @llvm.lifetime.end(i64 4000, i8* %7) #1
+ call void @llvm.lifetime.end(i64 4, i8* %6) #1
+ call void @llvm.lifetime.end(i64 4, i8* %5) #1
+ call void @llvm.lifetime.end(i64 4000, i8* %4) #1
+ call void @llvm.lifetime.end(i64 4, i8* %3) #1
+ call void @llvm.lifetime.end(i64 4000, i8* %2) #1
+ call void @llvm.lifetime.end(i64 120, i8* %1) #1
+ call void @llvm.lifetime.end(i64 4, i8* %0) #1
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+
+declare i32 @check_a(...) #2
+declare i32 @bar1(...) #2
+declare i32 @check_f(...) #2
+declare i32 @bar3(...) #2
+declare i32 @bar2(...) #2
+declare i32 @check_b(...) #2
+declare i32 @check_e(...) #2
+declare i32 @check_d(...) #2
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
diff --git a/test/CodeGen/X86/localescape.ll b/test/CodeGen/X86/localescape.ll
index 3cd174df0b71..10ab8dd9672f 100644
--- a/test/CodeGen/X86/localescape.ll
+++ b/test/CodeGen/X86/localescape.ll
@@ -39,21 +39,19 @@ define void @print_framealloc_from_fp(i8* %fp) {
; X86-LABEL: print_framealloc_from_fp:
; X86: pushl %esi
-; X86: subl $8, %esp
-; X86: movl 16(%esp), %esi
-; X86: movl Lalloc_func$frame_escape_0(%esi), %eax
-; X86: movl %eax, 4(%esp)
-; X86: movl $_str, (%esp)
+; X86: movl 8(%esp), %esi
+; X86: pushl Lalloc_func$frame_escape_0(%esi)
+; X86: pushl $_str
; X86: calll _printf
-; X86: movl Lalloc_func$frame_escape_1(%esi), %eax
-; X86: movl %eax, 4(%esp)
-; X86: movl $_str, (%esp)
+; X86: addl $8, %esp
+; X86: pushl Lalloc_func$frame_escape_1(%esi)
+; X86: pushl $_str
; X86: calll _printf
+; X86: addl $8, %esp
; X86: movl $42, Lalloc_func$frame_escape_1(%esi)
; X86: movl $4, %eax
-; X86: movl Lalloc_func$frame_escape_1(%esi,%eax), %eax
-; X86: movl %eax, 4(%esp)
-; X86: movl $_str, (%esp)
+; X86: pushl Lalloc_func$frame_escape_1(%esi,%eax)
+; X86: pushl $_str
; X86: calll _printf
; X86: addl $8, %esp
; X86: popl %esi
@@ -132,12 +130,12 @@ define void @alloc_func_no_frameaddr() {
; X64: retq
; X86-LABEL: alloc_func_no_frameaddr:
-; X86: subl $12, %esp
-; X86: Lalloc_func_no_frameaddr$frame_escape_0 = 8
-; X86: Lalloc_func_no_frameaddr$frame_escape_1 = 4
-; X86: movl $42, 8(%esp)
-; X86: movl $13, 4(%esp)
-; X86: movl $0, (%esp)
+; X86: subl $8, %esp
+; X86: Lalloc_func_no_frameaddr$frame_escape_0 = 4
+; X86: Lalloc_func_no_frameaddr$frame_escape_1 = 0
+; X86: movl $42, 4(%esp)
+; X86: movl $13, (%esp)
+; X86: pushl $0
; X86: calll _print_framealloc_from_fp
-; X86: addl $12, %esp
+; X86: addl $12, %esp
; X86: retl
diff --git a/test/CodeGen/X86/lock-inst-encoding.ll b/test/CodeGen/X86/lock-inst-encoding.ll
deleted file mode 100644
index 5ce771f14ab2..000000000000
--- a/test/CodeGen/X86/lock-inst-encoding.ll
+++ /dev/null
@@ -1,44 +0,0 @@
-; RUN: llc -O0 --show-mc-encoding < %s | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-darwin10.0.0"
-
-; CHECK-LABEL: f1:
-; CHECK: addq %{{.*}}, ({{.*}}){{.*}}encoding: [0xf0,0x48,0x01,0x37]
-; CHECK: ret
-define void @f1(i64* %a, i64 %b) nounwind {
- %1 = atomicrmw add i64* %a, i64 %b monotonic
- ret void
-}
-
-; CHECK-LABEL: f2:
-; CHECK: subq %{{.*}}, ({{.*}}){{.*}}encoding: [0xf0,0x48,0x29,0x37]
-; CHECK: ret
-define void @f2(i64* %a, i64 %b) nounwind {
- %1 = atomicrmw sub i64* %a, i64 %b monotonic
- ret void
-}
-
-; CHECK-LABEL: f3:
-; CHECK: andq %{{.*}}, ({{.*}}){{.*}}encoding: [0xf0,0x48,0x21,0x37]
-; CHECK: ret
-define void @f3(i64* %a, i64 %b) nounwind {
- %1 = atomicrmw and i64* %a, i64 %b monotonic
- ret void
-}
-
-; CHECK-LABEL: f4:
-; CHECK: orq %{{.*}}, ({{.*}}){{.*}}encoding: [0xf0,0x48,0x09,0x37]
-; CHECK: ret
-define void @f4(i64* %a, i64 %b) nounwind {
- %1 = atomicrmw or i64* %a, i64 %b monotonic
- ret void
-}
-
-; CHECK-LABEL: f5:
-; CHECK: xorq %{{.*}}, ({{.*}}){{.*}}encoding: [0xf0,0x48,0x31,0x37]
-; CHECK: ret
-define void @f5(i64* %a, i64 %b) nounwind {
- %1 = atomicrmw xor i64* %a, i64 %b monotonic
- ret void
-}
diff --git a/test/CodeGen/X86/loop-blocks.ll b/test/CodeGen/X86/loop-blocks.ll
index a81ceb902ab4..1a1d11e6cb31 100644
--- a/test/CodeGen/X86/loop-blocks.ll
+++ b/test/CodeGen/X86/loop-blocks.ll
@@ -200,6 +200,34 @@ block102:
br label %loop
}
+; CHECK-LABEL: check_minsize:
+; CHECK: jmp .LBB4_1
+; CHECK-NOT: align
+; CHECK-NEXT: .LBB4_2:
+; CHECK-NEXT: callq loop_latch
+; CHECK-NEXT: .LBB4_1:
+; CHECK-NEXT: callq loop_header
+
+
+define void @check_minsize() minsize nounwind {
+entry:
+ br label %loop
+
+loop:
+ call void @loop_header()
+ %t0 = tail call i32 @get()
+ %t1 = icmp slt i32 %t0, 0
+ br i1 %t1, label %done, label %bb
+
+bb:
+ call void @loop_latch()
+ br label %loop
+
+done:
+ call void @exit()
+ ret void
+}
+
declare void @bar99() nounwind
declare void @bar100() nounwind
declare void @bar101() nounwind
diff --git a/test/CodeGen/X86/lsr-static-addr.ll b/test/CodeGen/X86/lsr-static-addr.ll
index 97451e5573fe..3980bee9a306 100644
--- a/test/CodeGen/X86/lsr-static-addr.ll
+++ b/test/CodeGen/X86/lsr-static-addr.ll
@@ -11,8 +11,8 @@
; CHECK-NEXT: incq %rax
-; ATOM: xorl %eax, %eax
; ATOM: movsd .LCPI0_0(%rip), %xmm0
+; ATOM: xorl %eax, %eax
; ATOM: align
; ATOM-NEXT: BB0_2:
; ATOM-NEXT: movsd A(,%rax,8)
diff --git a/test/CodeGen/X86/lzcnt-tzcnt.ll b/test/CodeGen/X86/lzcnt-tzcnt.ll
index aa9ae2b7b100..76e7429ab8da 100644
--- a/test/CodeGen/X86/lzcnt-tzcnt.ll
+++ b/test/CodeGen/X86/lzcnt-tzcnt.ll
@@ -72,39 +72,6 @@ define i64 @test6_ctlz(i64 %v) {
; CHECK-NEXT: ret
-define i16 @test7_ctlz(i16 %v) {
- %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
- %tobool = icmp eq i16 0, %v
- %cond = select i1 %tobool, i16 %cnt, i16 16
- ret i16 %cond
-}
-; CHECK-LABEL: test7_ctlz
-; CHECK: lzcnt
-; CHECK-NEXT: ret
-
-
-define i32 @test8_ctlz(i32 %v) {
- %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
- %tobool = icmp eq i32 0, %v
- %cond = select i1 %tobool, i32 %cnt, i32 32
- ret i32 %cond
-}
-; CHECK-LABEL: test8_ctlz
-; CHECK: lzcnt
-; CHECK-NEXT: ret
-
-
-define i64 @test9_ctlz(i64 %v) {
- %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
- %tobool = icmp eq i64 0, %v
- %cond = select i1 %tobool, i64 %cnt, i64 64
- ret i64 %cond
-}
-; CHECK-LABEL: test9_ctlz
-; CHECK: lzcnt
-; CHECK-NEXT: ret
-
-
define i16 @test10_ctlz(i16* %ptr) {
%v = load i16, i16* %ptr
%cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
@@ -183,45 +150,6 @@ define i64 @test15_ctlz(i64* %ptr) {
; CHECK-NEXT: ret
-define i16 @test16_ctlz(i16* %ptr) {
- %v = load i16, i16* %ptr
- %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
- %tobool = icmp eq i16 0, %v
- %cond = select i1 %tobool, i16 %cnt, i16 16
- ret i16 %cond
-}
-; CHECK-LABEL: test16_ctlz
-; CHECK-NOT: movw
-; CHECK: lzcnt
-; CHECK-NEXT: ret
-
-
-define i32 @test17_ctlz(i32* %ptr) {
- %v = load i32, i32* %ptr
- %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
- %tobool = icmp eq i32 0, %v
- %cond = select i1 %tobool, i32 %cnt, i32 32
- ret i32 %cond
-}
-; CHECK-LABEL: test17_ctlz
-; CHECK-NOT: movd
-; CHECK: lzcnt
-; CHECK-NEXT: ret
-
-
-define i64 @test18_ctlz(i64* %ptr) {
- %v = load i64, i64* %ptr
- %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
- %tobool = icmp eq i64 0, %v
- %cond = select i1 %tobool, i64 %cnt, i64 64
- ret i64 %cond
-}
-; CHECK-LABEL: test18_ctlz
-; CHECK-NOT: movq
-; CHECK: lzcnt
-; CHECK-NEXT: ret
-
-
define i16 @test1_cttz(i16 %v) {
%cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
%tobool = icmp eq i16 %v, 0
@@ -288,39 +216,6 @@ define i64 @test6_cttz(i64 %v) {
; CHECK-NEXT: ret
-define i16 @test7_cttz(i16 %v) {
- %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
- %tobool = icmp eq i16 0, %v
- %cond = select i1 %tobool, i16 %cnt, i16 16
- ret i16 %cond
-}
-; CHECK-LABEL: test7_cttz
-; CHECK: tzcnt
-; CHECK-NEXT: ret
-
-
-define i32 @test8_cttz(i32 %v) {
- %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
- %tobool = icmp eq i32 0, %v
- %cond = select i1 %tobool, i32 %cnt, i32 32
- ret i32 %cond
-}
-; CHECK-LABEL: test8_cttz
-; CHECK: tzcnt
-; CHECK-NEXT: ret
-
-
-define i64 @test9_cttz(i64 %v) {
- %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
- %tobool = icmp eq i64 0, %v
- %cond = select i1 %tobool, i64 %cnt, i64 64
- ret i64 %cond
-}
-; CHECK-LABEL: test9_cttz
-; CHECK: tzcnt
-; CHECK-NEXT: ret
-
-
define i16 @test10_cttz(i16* %ptr) {
%v = load i16, i16* %ptr
%cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
@@ -399,77 +294,6 @@ define i64 @test15_cttz(i64* %ptr) {
; CHECK-NEXT: ret
-define i16 @test16_cttz(i16* %ptr) {
- %v = load i16, i16* %ptr
- %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
- %tobool = icmp eq i16 0, %v
- %cond = select i1 %tobool, i16 %cnt, i16 16
- ret i16 %cond
-}
-; CHECK-LABEL: test16_cttz
-; CHECK-NOT: movw
-; CHECK: tzcnt
-; CHECK-NEXT: ret
-
-
-define i32 @test17_cttz(i32* %ptr) {
- %v = load i32, i32* %ptr
- %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
- %tobool = icmp eq i32 0, %v
- %cond = select i1 %tobool, i32 %cnt, i32 32
- ret i32 %cond
-}
-; CHECK-LABEL: test17_cttz
-; CHECK-NOT: movd
-; CHECK: tzcnt
-; CHECK-NEXT: ret
-
-
-define i64 @test18_cttz(i64* %ptr) {
- %v = load i64, i64* %ptr
- %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
- %tobool = icmp eq i64 0, %v
- %cond = select i1 %tobool, i64 %cnt, i64 64
- ret i64 %cond
-}
-; CHECK-LABEL: test18_cttz
-; CHECK-NOT: movq
-; CHECK: tzcnt
-; CHECK-NEXT: ret
-
-define i16 @test1b_ctlz(i16 %v) {
- %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
- %tobool = icmp ne i16 %v, 0
- %cond = select i1 %tobool, i16 16, i16 %cnt
- ret i16 %cond
-}
-; CHECK-LABEL: test1b_ctlz
-; CHECK: lzcnt
-; CHECK-NEXT: ret
-
-
-define i32 @test2b_ctlz(i32 %v) {
- %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
- %tobool = icmp ne i32 %v, 0
- %cond = select i1 %tobool, i32 32, i32 %cnt
- ret i32 %cond
-}
-; CHECK-LABEL: test2b_ctlz
-; CHECK: lzcnt
-; CHECK-NEXT: ret
-
-
-define i64 @test3b_ctlz(i64 %v) {
- %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
- %tobool = icmp ne i64 %v, 0
- %cond = select i1 %tobool, i64 64, i64 %cnt
- ret i64 %cond
-}
-; CHECK-LABEL: test3b_ctlz
-; CHECK: lzcnt
-; CHECK-NEXT: ret
-
-
define i16 @test4b_ctlz(i16 %v) {
%cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
%tobool = icmp ne i16 %v, 0
@@ -503,39 +327,6 @@ define i64 @test6b_ctlz(i64 %v) {
; CHECK-NEXT: ret
-define i16 @test1b_cttz(i16 %v) {
- %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
- %tobool = icmp ne i16 %v, 0
- %cond = select i1 %tobool, i16 16, i16 %cnt
- ret i16 %cond
-}
-; CHECK-LABEL: test1b_cttz
-; CHECK: tzcnt
-; CHECK-NEXT: ret
-
-
-define i32 @test2b_cttz(i32 %v) {
- %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
- %tobool = icmp ne i32 %v, 0
- %cond = select i1 %tobool, i32 32, i32 %cnt
- ret i32 %cond
-}
-; CHECK-LABEL: test2b_cttz
-; CHECK: tzcnt
-; CHECK-NEXT: ret
-
-
-define i64 @test3b_cttz(i64 %v) {
- %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
- %tobool = icmp ne i64 %v, 0
- %cond = select i1 %tobool, i64 64, i64 %cnt
- ret i64 %cond
-}
-; CHECK-LABEL: test3b_cttz
-; CHECK: tzcnt
-; CHECK-NEXT: ret
-
-
define i16 @test4b_cttz(i16 %v) {
%cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
%tobool = icmp ne i16 %v, 0
diff --git a/test/CodeGen/X86/machine-combiner-int.ll b/test/CodeGen/X86/machine-combiner-int.ll
index 4a1ba1a980ae..df35abd9534d 100644
--- a/test/CodeGen/X86/machine-combiner-int.ll
+++ b/test/CodeGen/X86/machine-combiner-int.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -stop-after machine-combiner -o /dev/null 2>&1 | FileCheck %s --check-prefix=DEAD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -stop-after machine-combiner -o - | FileCheck %s --check-prefix=DEAD
; Verify that integer multiplies are reassociated. The first multiply in
; each test should be independent of the result of the preceding add (lea).
@@ -10,9 +10,12 @@
define i16 @reassociate_muls_i16(i16 %x0, i16 %x1, i16 %x2, i16 %x3) {
; CHECK-LABEL: reassociate_muls_i16:
; CHECK: # BB#0:
+; CHECK-NEXT: # kill
+; CHECK-NEXT: # kill
; CHECK-NEXT: leal (%rdi,%rsi), %eax
; CHECK-NEXT: imull %ecx, %edx
; CHECK-NEXT: imull %edx, %eax
+; CHECK-NEXT: # kill
; CHECK-NEXT: retq
%t0 = add i16 %x0, %x1
%t1 = mul i16 %x2, %t0
@@ -23,6 +26,8 @@ define i16 @reassociate_muls_i16(i16 %x0, i16 %x1, i16 %x2, i16 %x3) {
define i32 @reassociate_muls_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
; CHECK-LABEL: reassociate_muls_i32:
; CHECK: # BB#0:
+; CHECK-NEXT: # kill
+; CHECK-NEXT: # kill
; CHECK-NEXT: leal (%rdi,%rsi), %eax
; CHECK-NEXT: imull %ecx, %edx
; CHECK-NEXT: imull %edx, %eax
@@ -60,8 +65,8 @@ define i8 @reassociate_ands_i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3) {
; CHECK-NEXT: subb %sil, %dil
; CHECK-NEXT: andb %cl, %dl
; CHECK-NEXT: andb %dil, %dl
-; CHECK_NEXT: movb %dx, %ax
-; CHECK_NEXT: retq
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
%t0 = sub i8 %x0, %x1
%t1 = and i8 %x2, %t0
%t2 = and i8 %x3, %t1
@@ -76,8 +81,8 @@ define i32 @reassociate_ands_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
; CHECK-NEXT: subl %esi, %edi
; CHECK-NEXT: andl %ecx, %edx
; CHECK-NEXT: andl %edi, %edx
-; CHECK_NEXT: movl %edx, %eax
-; CHECK_NEXT: retq
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
%t0 = sub i32 %x0, %x1
%t1 = and i32 %x2, %t0
%t2 = and i32 %x3, %t1
@@ -91,7 +96,7 @@ define i64 @reassociate_ands_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) {
; CHECK-NEXT: andq %rcx, %rdx
; CHECK-NEXT: andq %rdi, %rdx
; CHECK-NEXT: movq %rdx, %rax
-; CHECK_NEXT: retq
+; CHECK-NEXT: retq
%t0 = sub i64 %x0, %x1
%t1 = and i64 %x2, %t0
%t2 = and i64 %x3, %t1
@@ -107,8 +112,8 @@ define i8 @reassociate_ors_i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3) {
; CHECK-NEXT: subb %sil, %dil
; CHECK-NEXT: orb %cl, %dl
; CHECK-NEXT: orb %dil, %dl
-; CHECK_NEXT: movb %dx, %ax
-; CHECK_NEXT: retq
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
%t0 = sub i8 %x0, %x1
%t1 = or i8 %x2, %t0
%t2 = or i8 %x3, %t1
@@ -123,8 +128,8 @@ define i32 @reassociate_ors_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
; CHECK-NEXT: subl %esi, %edi
; CHECK-NEXT: orl %ecx, %edx
; CHECK-NEXT: orl %edi, %edx
-; CHECK_NEXT: movl %edx, %eax
-; CHECK_NEXT: retq
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
%t0 = sub i32 %x0, %x1
%t1 = or i32 %x2, %t0
%t2 = or i32 %x3, %t1
@@ -138,7 +143,7 @@ define i64 @reassociate_ors_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) {
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: orq %rdi, %rdx
; CHECK-NEXT: movq %rdx, %rax
-; CHECK_NEXT: retq
+; CHECK-NEXT: retq
%t0 = sub i64 %x0, %x1
%t1 = or i64 %x2, %t0
%t2 = or i64 %x3, %t1
@@ -154,8 +159,8 @@ define i8 @reassociate_xors_i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3) {
; CHECK-NEXT: subb %sil, %dil
; CHECK-NEXT: xorb %cl, %dl
; CHECK-NEXT: xorb %dil, %dl
-; CHECK_NEXT: movb %dx, %ax
-; CHECK_NEXT: retq
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
%t0 = sub i8 %x0, %x1
%t1 = xor i8 %x2, %t0
%t2 = xor i8 %x3, %t1
@@ -170,8 +175,8 @@ define i32 @reassociate_xors_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
; CHECK-NEXT: subl %esi, %edi
; CHECK-NEXT: xorl %ecx, %edx
; CHECK-NEXT: xorl %edi, %edx
-; CHECK_NEXT: movl %edx, %eax
-; CHECK_NEXT: retq
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
%t0 = sub i32 %x0, %x1
%t1 = xor i32 %x2, %t0
%t2 = xor i32 %x3, %t1
@@ -185,7 +190,7 @@ define i64 @reassociate_xors_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) {
; CHECK-NEXT: xorq %rcx, %rdx
; CHECK-NEXT: xorq %rdi, %rdx
; CHECK-NEXT: movq %rdx, %rax
-; CHECK_NEXT: retq
+; CHECK-NEXT: retq
%t0 = sub i64 %x0, %x1
%t1 = xor i64 %x2, %t0
%t2 = xor i64 %x3, %t1
diff --git a/test/CodeGen/X86/machine-copy-prop.mir b/test/CodeGen/X86/machine-copy-prop.mir
new file mode 100644
index 000000000000..c2cb4ceb7fbe
--- /dev/null
+++ b/test/CodeGen/X86/machine-copy-prop.mir
@@ -0,0 +1,227 @@
+# RUN: llc -march=x86 -run-pass machine-cp -verify-machineinstrs -o - %s | FileCheck %s
+
+--- |
+ declare void @foo()
+ define void @copyprop_remove_kill0() { ret void }
+ define void @copyprop_remove_kill1() { ret void }
+ define void @copyprop_remove_kill2() { ret void }
+ define void @copyprop0() { ret void }
+ define void @copyprop1() { ret void }
+ define void @copyprop2() { ret void }
+ define void @nocopyprop0() { ret void }
+ define void @nocopyprop1() { ret void }
+ define void @nocopyprop2() { ret void }
+ define void @nocopyprop3() { ret void }
+ define void @nocopyprop4() { ret void }
+ define void @nocopyprop5() { ret void }
+...
+---
+# The second copy is redundant and will be removed; check that we also remove
+# the kill flag of intermediate instructions.
+# CHECK-LABEL: name: copyprop_remove_kill0
+# CHECK: bb.0:
+# CHECK-NEXT: %rax = COPY %rdi
+# CHECK-NEXT: NOOP implicit %rdi
+# CHECK-NOT: COPY
+# CHECK-NEXT: NOOP implicit %rax, implicit %rdi
+name: copyprop_remove_kill0
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rax = COPY %rdi
+ NOOP implicit killed %rdi
+ %rdi = COPY %rax
+ NOOP implicit %rax, implicit %rdi
+...
+---
+# The second copy is redundant and will be removed; check that we also remove
+# the kill flag of intermediate instructions.
+# CHECK-LABEL: name: copyprop_remove_kill1
+# CHECK: bb.0:
+# CHECK-NEXT: %rax = COPY %rdi
+# CHECK-NEXT: NOOP implicit %edi
+# CHECK-NOT: COPY
+# CHECK-NEXT: NOOP implicit %rax, implicit %rdi
+name: copyprop_remove_kill1
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rax = COPY %rdi
+ NOOP implicit killed %edi
+ %rdi = COPY %rax
+ NOOP implicit %rax, implicit %rdi
+...
+---
+# The second copy is redundant and will be removed; check that we also remove
+# the kill flag of intermediate instructions.
+# CHECK-LABEL: name: copyprop_remove_kill2
+# CHECK: bb.0:
+# CHECK-NEXT: %ax = COPY %di
+# CHECK-NEXT: NOOP implicit %rdi
+# CHECK-NOT: COPY
+# CHECK-NEXT: NOOP implicit %rax, implicit %rdi
+name: copyprop_remove_kill2
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %ax = COPY %di
+ NOOP implicit killed %rdi
+ %di = COPY %ax
+ NOOP implicit %rax, implicit %rdi
+...
+---
+# The second copy is redundant; the call preserves the source and dest register.
+# CHECK-LABEL: name: copyprop0
+# CHECK: bb.0:
+# CHECK-NEXT: %rax = COPY %rdi
+# CHECK-NEXT: CALL64pcrel32 @foo, csr_64_rt_mostregs
+# CHECK-NEXT: NOOP implicit %edi
+# CHECK-NOT: COPY
+# CHECK-NEXT: NOOP implicit %rax, implicit %rdi
+name: copyprop0
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rax = COPY %rdi
+ CALL64pcrel32 @foo, csr_64_rt_mostregs
+ NOOP implicit killed %edi
+ %rdi = COPY %rax
+ NOOP implicit %rax, implicit %rdi
+...
+---
+# The second copy is redundant; neither the source nor the dest register is modified in between.
+# CHECK-LABEL: name: copyprop1
+# CHECK: bb.0:
+# CHECK-NEXT: %rax = COPY %rdi
+# CHECK-NEXT: NOOP implicit %rax
+# CHECK-NEXT: NOOP implicit %rax, implicit %rdi
+name: copyprop1
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rax = COPY %rdi
+ NOOP implicit killed %rax
+ %rax = COPY %rdi
+ NOOP implicit %rax, implicit %rdi
+...
+---
+# CHECK-LABEL: name: copyprop2
+# CHECK: bb.0:
+# CHECK-NEXT: %rax = COPY %rdi
+# CHECK-NEXT: NOOP implicit %ax
+# CHECK-NEXT: CALL64pcrel32 @foo, csr_64_rt_mostregs
+# CHECK-NOT: %rax = COPY %rdi
+# CHECK-NEXT: NOOP implicit %rax, implicit %rdi
+name: copyprop2
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rax = COPY %rdi
+ NOOP implicit killed %ax
+ CALL64pcrel32 @foo, csr_64_rt_mostregs
+ %rax = COPY %rdi
+ NOOP implicit %rax, implicit %rdi
+...
+---
+# The second copy is not redundant if the source register (%rax) is clobbered
+# even if the dest (%rbp) is not.
+# CHECK-LABEL: name: nocopyprop0
+# CHECK: bb.0:
+# CHECK-NEXT: %rax = COPY %rbp
+# CHECK-NEXT: CALL64pcrel32 @foo, csr_64, implicit %rax, implicit %rbp
+# CHECK-NEXT: %rbp = COPY %rax
+# CHECK-NEXT: NOOP implicit %rax, implicit %rbp
+name: nocopyprop0
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rax = COPY %rbp
+ CALL64pcrel32 @foo, csr_64, implicit %rax, implicit %rbp
+ %rbp = COPY %rax
+ NOOP implicit %rax, implicit %rbp
+...
+---
+# The second copy is not redundant if the dest register (%rax) is clobbered
+# even if the source (%rbp) is not.
+# CHECK-LABEL: name: nocopyprop1
+# CHECK: bb.0:
+# CHECK-NEXT: %rbp = COPY %rax
+# CHECK-NEXT: CALL64pcrel32 @foo, csr_64, implicit %rax, implicit %rbp
+# CHECK-NEXT: %rax = COPY %rbp
+# CHECK-NEXT: NOOP implicit %rax, implicit %rbp
+name: nocopyprop1
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rbp = COPY %rax
+ CALL64pcrel32 @foo, csr_64, implicit %rax, implicit %rbp
+ %rax = COPY %rbp
+ NOOP implicit %rax, implicit %rbp
+...
+---
+# The second copy is not redundant if the dest register (%rax) is clobbered
+# even if the source (%rbp) is not.
+# CHECK-LABEL: name: nocopyprop2
+# CHECK: bb.0:
+# CHECK-NEXT: %rax = COPY %rbp
+# CHECK-NEXT: CALL64pcrel32 @foo, csr_64, implicit %rax, implicit %rbp
+# CHECK-NEXT: %rax = COPY %rbp
+# CHECK-NEXT: NOOP implicit %rax, implicit %rbp
+name: nocopyprop2
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rax = COPY %rbp
+ CALL64pcrel32 @foo, csr_64, implicit %rax, implicit %rbp
+ %rax = COPY %rbp
+ NOOP implicit %rax, implicit %rbp
+...
+---
+# The second copy is not redundant if the source register (%rax) is clobbered
+# even if the dest (%rbp) is not.
+# CHECK-LABEL: name: nocopyprop3
+# CHECK: bb.0:
+# CHECK-NEXT: %rbp = COPY %rax
+# CHECK-NEXT: CALL64pcrel32 @foo, csr_64, implicit %rax, implicit %rbp
+# CHECK-NEXT: %rbp = COPY %rax
+# CHECK-NEXT: NOOP implicit %rax, implicit %rbp
+name: nocopyprop3
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rbp = COPY %rax
+ CALL64pcrel32 @foo, csr_64, implicit %rax, implicit %rbp
+ %rbp = COPY %rax
+ NOOP implicit %rax, implicit %rbp
+...
+---
+# A reserved register may change its value, so the second copy is not redundant.
+# CHECK-LABEL: name: nocopyprop4
+# CHECK: bb.0:
+# CHECK-NEXT: %rax = COPY %rip
+# CHECK-NEXT: NOOP implicit %rax
+# CHECK-NEXT: %rax = COPY %rip
+# CHECK-NEXT: NOOP implicit %rax
+name: nocopyprop4
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rax = COPY %rip
+ NOOP implicit %rax
+ %rax = COPY %rip
+ NOOP implicit %rax
+...
+---
+# Writing to a reserved register may have additional effects (this is a slightly
+# illegal testcase because writing to %rip like this should make the instruction a jump)
+# CHECK-LABEL: name: nocopyprop5
+# CHECK: bb.0:
+# CHECK-NEXT: %rip = COPY %rax
+# CHECK-NEXT: %rip = COPY %rax
+name: nocopyprop5
+allVRegsAllocated: true
+body: |
+ bb.0:
+ %rip = COPY %rax
+ %rip = COPY %rax
+...
diff --git a/test/CodeGen/X86/machine-cp.ll b/test/CodeGen/X86/machine-cp.ll
index 143a1c3787a0..57663a011f10 100644
--- a/test/CodeGen/X86/machine-cp.ll
+++ b/test/CodeGen/X86/machine-cp.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-apple-macosx -mcpu=nocona -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-macosx -mattr=+sse2 -verify-machineinstrs < %s | FileCheck %s
; After tail duplication, two copies in an early exit BB can be cancelled out.
; rdar://10640363
diff --git a/test/CodeGen/X86/machine-sink-and-implicit-null-checks.ll b/test/CodeGen/X86/machine-sink-and-implicit-null-checks.ll
new file mode 100644
index 000000000000..16ee6ebbbcdb
--- /dev/null
+++ b/test/CodeGen/X86/machine-sink-and-implicit-null-checks.ll
@@ -0,0 +1,83 @@
+; RUN: llc -mtriple=x86_64-apple-macosx -O3 -enable-implicit-null-checks -o - < %s 2>&1 | FileCheck %s
+
+declare void @throw0()
+declare void @throw1()
+
+define i1 @f(i8* %p0, i8* %p1) {
+ entry:
+ %c0 = icmp eq i8* %p0, null
+ br i1 %c0, label %throw0, label %continue0, !make.implicit !0
+
+ continue0:
+ %v0 = load i8, i8* %p0
+ %c1 = icmp eq i8* %p1, null
+ br i1 %c1, label %throw1, label %continue1, !make.implicit !0
+
+ continue1:
+ %v1 = load i8, i8* %p1
+ %v = icmp eq i8 %v0, %v1
+ ret i1 %v
+
+ throw0:
+ call void @throw0()
+ unreachable
+
+ throw1:
+ call void @throw1()
+ unreachable
+}
+
+declare void @foo()
+
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32, i32) nounwind readonly
+
+; Check for a crash. The crash is not specific to statepoints, but
+; gc.statepoint is an easy way to generate a fill instruction in
+; %continue0 (which causes the llc crash).
+define i1 @g(i8 addrspace(1)* %p0, i8* %p1) gc "statepoint-example" {
+ entry:
+ %c0 = icmp eq i8 addrspace(1)* %p0, null
+ %tok = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @foo, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %p0)
+ %p0.relocated = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %tok, i32 7, i32 7) ; (%p0, %p0)
+ br i1 %c0, label %throw0, label %continue0, !make.implicit !0
+
+ continue0:
+ %c1 = icmp eq i8* %p1, null
+ br i1 %c1, label %throw1, label %continue1, !make.implicit !0
+
+ continue1:
+ %v0 = load i8, i8 addrspace(1)* %p0.relocated
+ %v1 = load i8, i8* %p1
+ %v = icmp eq i8 %v0, %v1
+ ret i1 %v
+
+ throw0:
+ call void @throw0()
+ unreachable
+
+ throw1:
+ call void @throw1()
+ unreachable
+}
+
+; Check that we have two implicit null checks in @f
+
+; CHECK: __LLVM_FaultMaps:
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 1
+
+; FunctionInfo[0] =
+
+; FunctionAddress =
+; CHECK-NEXT: .quad _f
+
+; NumFaultingPCs =
+; CHECK-NEXT: .long 2
+
+; Reserved =
+; CHECK-NEXT: .long 0
+
+!0 = !{}
diff --git a/test/CodeGen/X86/machine-trace-metrics-crash.ll b/test/CodeGen/X86/machine-trace-metrics-crash.ll
index 048260c51fe3..5b7c5445316c 100644
--- a/test/CodeGen/X86/machine-trace-metrics-crash.ll
+++ b/test/CodeGen/X86/machine-trace-metrics-crash.ll
@@ -51,10 +51,10 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!2}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: 1)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!1 = !DIFile(filename: "24199.cpp", directory: "/bin")
!2 = !{i32 2, !"Debug Info Version", i32 3}
-!3 = distinct !DISubprogram(linkageName: "foo", file: !1, line: 18, isLocal: false, isDefinition: true, scopeLine: 18)
+!3 = distinct !DISubprogram(linkageName: "foo", file: !1, line: 18, isLocal: false, isDefinition: true, scopeLine: 18, unit: !0)
!4 = !DIExpression()
!5 = !DILocalVariable(name: "this", arg: 1, scope: !3, flags: DIFlagArtificial | DIFlagObjectPointer)
!6 = !DILocation(line: 0, scope: !3)
diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll
index b7280d87d3b7..3b748eeb2e5a 100644
--- a/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/test/CodeGen/X86/masked_gather_scatter.ll
@@ -1,10 +1,9 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_64
; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_32
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX
; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX_32
; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
-
+; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -266,7 +265,7 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
; SKX-NEXT: kxnorw %k0, %k0, %k2
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
-; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: vmovaps %ymm2, %ymm0
; SKX-NEXT: retq
%a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
@@ -279,8 +278,7 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
;
; KNL_64-LABEL: test7:
; KNL_64: # BB#0:
-; KNL_64-NEXT: movzbl %sil, %eax
-; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
@@ -292,7 +290,8 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
; KNL_32-LABEL: test7:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}
@@ -306,7 +305,7 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
; SKX-NEXT: kmovb %esi, %k1
; SKX-NEXT: kmovw %k1, %k2
; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
-; SKX-NEXT: vmovaps %zmm1, %zmm2
+; SKX-NEXT: vmovaps %ymm1, %ymm2
; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0
; SKX-NEXT: retq
@@ -405,9 +404,9 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
-; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
-; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
@@ -421,10 +420,10 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI8_1, %ymm3
; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
-; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI8_2, %ymm1
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
@@ -434,10 +433,10 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; SKX: # BB#0: # %entry
; SKX-NEXT: vpbroadcastq %rdi, %zmm2
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
@@ -467,9 +466,9 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
-; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
-; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
@@ -483,10 +482,10 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI9_1, %ymm3
; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
-; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI9_2, %ymm1
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
@@ -496,10 +495,10 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; SKX: # BB#0: # %entry
; SKX-NEXT: vpbroadcastq %rdi, %zmm2
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
@@ -638,8 +637,7 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
-; SKX-NEXT: vmovd %esi, %xmm1
-; SKX-NEXT: vpbroadcastd %xmm1, %ymm1
+; SKX-NEXT: vpbroadcastd %esi, %ymm1
; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
; SKX-NEXT: vpsllq $2, %zmm1, %zmm1
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
@@ -677,42 +675,42 @@ define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
;
; KNL_64-LABEL: test15:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; KNL_64: vpxor %ymm2, %ymm2, %ymm2
; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2
-; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm0
-; KNL_64-NEXT: vpsllq $63, %zmm0, %zmm0
-; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL_64-NEXT: vpslld $31, %ymm1, %ymm0
+; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
+; KNL_64-NEXT: # kill
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test15:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; KNL_32: vpxor %ymm2, %ymm2, %ymm2
; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2
-; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm0
-; KNL_32-NEXT: vpsllvq .LCPI14_0, %zmm0, %zmm0
-; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k1
+; KNL_32-NEXT: vpslld $31, %ymm1, %ymm0
+; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
+; KNL_32-NEXT: # kill
; KNL_32-NEXT: retl
;
; SKX-LABEL: test15:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX-NEXT: vpmovd2m %xmm1, %k1
+; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
-; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: vmovaps %xmm1, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test15:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX_32-NEXT: vpmovd2m %xmm1, %k1
+; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
-; SKX_32-NEXT: vmovaps %zmm1, %zmm0
+; SKX_32-NEXT: vmovaps %xmm1, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <4 x i32> %ind to <4 x i64>
@@ -726,7 +724,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x
;
; KNL_64-LABEL: test16:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_64: vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
@@ -740,7 +738,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x
;
; KNL_32-LABEL: test16:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_32: vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
@@ -756,18 +754,18 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x
; SKX-LABEL: test16:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX-NEXT: vpmovd2m %xmm1, %k1
+; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
-; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: vmovaps %ymm2, %ymm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test16:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX_32-NEXT: vpmovd2m %xmm1, %k1
+; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1}
-; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: vmovaps %ymm2, %ymm0
; SKX_32-NEXT: retl
%sext_ind = sext <4 x i32> %ind to <4 x i64>
@@ -780,7 +778,7 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x
;
; KNL_64-LABEL: test17:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -790,7 +788,7 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x
;
; KNL_32-LABEL: test17:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpsllvq .LCPI16_0, %zmm1, %zmm1
@@ -802,18 +800,18 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x
; SKX-LABEL: test17:
; SKX: # BB#0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX-NEXT: vpmovq2m %xmm1, %k1
+; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
-; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: vmovaps %xmm2, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test17:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX_32-NEXT: vpmovq2m %xmm1, %k1
+; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1}
-; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: vmovaps %xmm2, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
@@ -832,36 +830,34 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
;
; KNL_64-LABEL: test18:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL_64: vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2
-; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2
-; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test18:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL_32: vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
-; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2
-; KNL_32-NEXT: vpsllvq .LCPI17_0, %zmm2, %zmm2
-; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2
+; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test18:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm2, %xmm2
-; SKX-NEXT: vpmovd2m %xmm2, %k1
+; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test18:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
-; SKX_32-NEXT: vpmovd2m %xmm2, %k1
+; SKX_32-NEXT: vptestmd %xmm2, %xmm2, %k1
; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
@@ -872,7 +868,7 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind
;
; KNL_64-LABEL: test19:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_64: vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
@@ -884,7 +880,7 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind
;
; KNL_32-LABEL: test19:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_32: vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
@@ -898,14 +894,14 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind
; SKX-LABEL: test19:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX-NEXT: vpmovd2m %xmm1, %k1
+; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test19:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
-; SKX_32-NEXT: vpmovd2m %xmm1, %k1
+; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
; SKX_32-NEXT: retl
@@ -919,36 +915,34 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
;
; KNL_64-LABEL: test20:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL_64: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; KNL_64-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero
; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2
-; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2
-; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2
+; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test20:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL_32: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; KNL_32-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero
; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
-; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2
-; KNL_32-NEXT: vpsllvq .LCPI19_0, %zmm2, %zmm2
-; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2
+; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test20:
; SKX: # BB#0:
-; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX-NEXT: vpmovq2m %xmm2, %k0
-; SKX-NEXT: kshiftlw $2, %k0, %k0
-; SKX-NEXT: kshiftrw $2, %k0, %k1
+; SKX: vpsllq $63, %xmm2, %xmm2
+; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0
+; SKX-NEXT: kshiftlb $6, %k0, %k0
+; SKX-NEXT: kshiftrb $6, %k0, %k1
; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: retq
;
@@ -956,9 +950,9 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
; SKX_32: # BB#0:
; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX_32-NEXT: vpmovq2m %xmm2, %k0
-; SKX_32-NEXT: kshiftlw $2, %k0, %k0
-; SKX_32-NEXT: kshiftrw $2, %k0, %k1
+; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0
+; SKX_32-NEXT: kshiftlb $6, %k0, %k0
+; SKX_32-NEXT: kshiftrb $6, %k0, %k1
; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1}
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
@@ -970,7 +964,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
;
; KNL_64-LABEL: test21:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2
@@ -980,7 +974,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
;
; KNL_32-LABEL: test21:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: vpsllvq .LCPI20_0, %zmm2, %zmm2
@@ -990,20 +984,20 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
;
; SKX-LABEL: test21:
; SKX: # BB#0:
-; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX-NEXT: vpmovq2m %xmm2, %k0
-; SKX-NEXT: kshiftlw $2, %k0, %k0
-; SKX-NEXT: kshiftrw $2, %k0, %k1
+; SKX: vpsllq $63, %xmm2, %xmm2
+; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0
+; SKX-NEXT: kshiftlb $6, %k0, %k0
+; SKX-NEXT: kshiftrb $6, %k0, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test21:
; SKX_32: # BB#0:
-; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX_32-NEXT: vpmovq2m %xmm2, %k0
-; SKX_32-NEXT: kshiftlw $2, %k0, %k0
-; SKX_32-NEXT: kshiftrw $2, %k0, %k1
+; SKX_32: vpsllq $63, %xmm2, %xmm2
+; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0
+; SKX_32-NEXT: kshiftlb $6, %k0, %k0
+; SKX_32-NEXT: kshiftrb $6, %k0, %k1
; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX_32-NEXT: retl
@@ -1019,31 +1013,29 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl
;
; KNL_64-LABEL: test22:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; KNL_64: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; KNL_64-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero
; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
-; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
-; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
-; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_64-NEXT: vpslld $31, %ymm1, %ymm1
+; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
; KNL_64-NEXT: vmovaps %zmm2, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test22:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; KNL_32: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; KNL_32-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero
; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
-; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
-; KNL_32-NEXT: vpsllvq .LCPI21_0, %zmm1, %zmm1
-; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpslld $31, %ymm1, %ymm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-NEXT: retl
@@ -1052,23 +1044,23 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl
; SKX: # BB#0:
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX-NEXT: vpmovq2m %xmm1, %k0
-; SKX-NEXT: kshiftlw $2, %k0, %k0
-; SKX-NEXT: kshiftrw $2, %k0, %k1
+; SKX-NEXT: vptestmq %xmm1, %xmm1, %k0
+; SKX-NEXT: kshiftlb $6, %k0, %k0
+; SKX-NEXT: kshiftrb $6, %k0, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
-; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: vmovaps %xmm2, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test22:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX_32-NEXT: vpmovq2m %xmm1, %k0
-; SKX_32-NEXT: kshiftlw $2, %k0, %k0
-; SKX_32-NEXT: kshiftrw $2, %k0, %k1
+; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k0
+; SKX_32-NEXT: kshiftlb $6, %k0, %k0
+; SKX_32-NEXT: kshiftrb $6, %k0, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1}
-; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: vmovaps %xmm2, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
@@ -1083,7 +1075,7 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %
;
; KNL_64-LABEL: test23:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -1093,7 +1085,7 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %
;
; KNL_32-LABEL: test23:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpsllvq .LCPI22_0, %zmm1, %zmm1
@@ -1105,18 +1097,18 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %
; SKX-LABEL: test23:
; SKX: # BB#0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX-NEXT: vpmovq2m %xmm1, %k1
+; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
-; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: vmovaps %xmm2, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test23:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX_32-NEXT: vpmovq2m %xmm1, %k1
+; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
-; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: vmovaps %xmm2, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
@@ -1127,8 +1119,7 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %
define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
; KNL_64-LABEL: test24:
; KNL_64: # BB#0:
-; KNL_64-NEXT: movb $3, %al
-; KNL_64-NEXT: movzbl %al, %eax
+; KNL_64: movb $3, %al
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
@@ -1136,7 +1127,7 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
;
; KNL_32-LABEL: test24:
; KNL_32: # BB#0:
-; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpxord %zmm1, %zmm1, %zmm1
; KNL_32-NEXT: vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1
; KNL_32-NEXT: vpsllvq .LCPI23_1, %zmm1, %zmm1
@@ -1149,7 +1140,7 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
; SKX: # BB#0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
-; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: vmovaps %xmm1, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test24:
@@ -1157,7 +1148,7 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
-; SKX_32-NEXT: vmovaps %zmm1, %zmm0
+; SKX_32-NEXT: vmovaps %xmm1, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
@@ -1169,7 +1160,7 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %
;
; KNL_64-LABEL: test25:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_64: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -1179,7 +1170,7 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %
;
; KNL_32-LABEL: test25:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; KNL_32: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpsllvq .LCPI24_0, %zmm1, %zmm1
@@ -1191,18 +1182,18 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %
; SKX-LABEL: test25:
; SKX: # BB#0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX-NEXT: vpmovq2m %xmm1, %k1
+; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
-; SKX-NEXT: vmovaps %zmm2, %zmm0
+; SKX-NEXT: vmovaps %xmm2, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test25:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX_32-NEXT: vpmovq2m %xmm1, %k1
+; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
-; SKX_32-NEXT: vmovaps %zmm2, %zmm0
+; SKX_32-NEXT: vmovaps %xmm2, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
@@ -1214,8 +1205,7 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
;
; KNL_64-LABEL: test26:
; KNL_64: # BB#0:
-; KNL_64-NEXT: movb $3, %al
-; KNL_64-NEXT: movzbl %al, %eax
+; KNL_64: movb $3, %al
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
@@ -1223,7 +1213,7 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
;
; KNL_32-LABEL: test26:
; KNL_32: # BB#0:
-; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
; KNL_32-NEXT: vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2
; KNL_32-NEXT: vpsllvq .LCPI25_1, %zmm2, %zmm2
@@ -1236,7 +1226,7 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
; SKX: # BB#0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
-; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: vmovaps %xmm1, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test26:
@@ -1244,7 +1234,7 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
-; SKX_32-NEXT: vmovaps %zmm1, %zmm0
+; SKX_32-NEXT: vmovaps %xmm1, %xmm0
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
@@ -1260,9 +1250,9 @@ define <2 x float> @test27(float* %base, <2 x i32> %ind) {
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_64-NEXT: movb $3, %al
-; KNL_64-NEXT: movzbl %al, %eax
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
+; KNL_64-NEXT: # kill
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test27:
@@ -1271,9 +1261,9 @@ define <2 x float> @test27(float* %base, <2 x i32> %ind) {
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: movb $3, %cl
-; KNL_32-NEXT: movzbl %cl, %ecx
; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
+; KNL_32-NEXT: # kill
; KNL_32-NEXT: retl
;
; SKX-LABEL: test27:
@@ -1295,16 +1285,15 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
;
; KNL_64-LABEL: test28:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: movb $3, %al
-; KNL_64-NEXT: movzbl %al, %eax
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test28:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_32: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
; KNL_32-NEXT: vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2
; KNL_32-NEXT: vpsllvq .LCPI27_1, %zmm2, %zmm2
@@ -1314,7 +1303,7 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
;
; SKX-LABEL: test28:
; SKX: # BB#0:
-; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: movb $3, %al
; SKX-NEXT: kmovb %eax, %k1
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
@@ -1322,7 +1311,7 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
;
; SKX_32-LABEL: test28:
; SKX_32: # BB#0:
-; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX_32: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: movb $3, %al
; SKX_32-NEXT: kmovb %eax, %k1
; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
@@ -1381,12 +1370,9 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x
; KNL_64-LABEL: test30:
; KNL_64: # BB#0:
; KNL_64-NEXT: andl $1, %edx
-; KNL_64-NEXT: kmovw %edx, %k1
; KNL_64-NEXT: andl $1, %esi
-; KNL_64-NEXT: kmovw %esi, %k2
; KNL_64-NEXT: movl %edi, %eax
; KNL_64-NEXT: andl $1, %eax
-; KNL_64-NEXT: kmovw %eax, %k0
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm1
@@ -1394,102 +1380,97 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x
; KNL_64-NEXT: testb $1, %dil
; KNL_64-NEXT: je .LBB29_2
; KNL_64-NEXT: # BB#1: # %cond.load
-; KNL_64-NEXT: vmovq %xmm1, %rax
-; KNL_64-NEXT: vmovd (%rax), %xmm0
+; KNL_64-NEXT: vmovq %xmm1, %rcx
+; KNL_64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; KNL_64-NEXT: .LBB29_2: # %else
-; KNL_64-NEXT: kmovw %k2, %eax
-; KNL_64-NEXT: movl %eax, %ecx
-; KNL_64-NEXT: andl $1, %ecx
-; KNL_64-NEXT: testb %cl, %cl
+; KNL_64-NEXT: testb %sil, %sil
; KNL_64-NEXT: je .LBB29_4
; KNL_64-NEXT: # BB#3: # %cond.load1
; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx
; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0
; KNL_64-NEXT: .LBB29_4: # %else2
-; KNL_64-NEXT: kmovw %k1, %ecx
-; KNL_64-NEXT: movl %ecx, %edx
-; KNL_64-NEXT: andl $1, %edx
; KNL_64-NEXT: testb %dl, %dl
; KNL_64-NEXT: je .LBB29_6
; KNL_64-NEXT: # BB#5: # %cond.load4
; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1
-; KNL_64-NEXT: vmovq %xmm1, %rdx
-; KNL_64-NEXT: vpinsrd $2, (%rdx), %xmm0, %xmm0
+; KNL_64-NEXT: vmovq %xmm1, %rcx
+; KNL_64-NEXT: vpinsrd $2, (%rcx), %xmm0, %xmm0
; KNL_64-NEXT: .LBB29_6: # %else5
-; KNL_64-NEXT: kmovw %k0, %edx
-; KNL_64-NEXT: vmovd %edx, %xmm1
-; KNL_64-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; KNL_64-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; KNL_64-NEXT: vmovd %eax, %xmm1
+; KNL_64-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1
+; KNL_64-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test30:
; KNL_32: # BB#0:
+; KNL_32-NEXT: pushl %ebx
+; KNL_32-NEXT: .Ltmp0:
+; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: pushl %esi
+; KNL_32-NEXT: .Ltmp1:
+; KNL_32-NEXT: .cfi_def_cfa_offset 12
+; KNL_32-NEXT: .Ltmp2:
+; KNL_32-NEXT: .cfi_offset %esi, -12
+; KNL_32-NEXT: .Ltmp3:
+; KNL_32-NEXT: .cfi_offset %ebx, -8
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: andl $1, %eax
-; KNL_32-NEXT: kmovw %eax, %k1
-; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: andl $1, %eax
-; KNL_32-NEXT: kmovw %eax, %k2
-; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: movl %eax, %ecx
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; KNL_32-NEXT: andl $1, %ecx
-; KNL_32-NEXT: kmovw %ecx, %k0
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; KNL_32-NEXT: movl %ebx, %edx
+; KNL_32-NEXT: andl $1, %edx
; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; KNL_32-NEXT: # implicit-def: %XMM0
-; KNL_32-NEXT: testb $1, %al
+; KNL_32-NEXT: testb $1, %bl
; KNL_32-NEXT: je .LBB29_2
; KNL_32-NEXT: # BB#1: # %cond.load
-; KNL_32-NEXT: vmovd %xmm1, %eax
-; KNL_32-NEXT: vmovd (%eax), %xmm0
+; KNL_32-NEXT: vmovd %xmm1, %esi
+; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; KNL_32-NEXT: .LBB29_2: # %else
-; KNL_32-NEXT: kmovw %k2, %eax
-; KNL_32-NEXT: movl %eax, %ecx
-; KNL_32-NEXT: andl $1, %ecx
; KNL_32-NEXT: testb %cl, %cl
; KNL_32-NEXT: je .LBB29_4
; KNL_32-NEXT: # BB#3: # %cond.load1
-; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
-; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
+; KNL_32-NEXT: vpextrd $1, %xmm1, %esi
+; KNL_32-NEXT: vpinsrd $1, (%esi), %xmm0, %xmm0
; KNL_32-NEXT: .LBB29_4: # %else2
-; KNL_32-NEXT: kmovw %k1, %ecx
-; KNL_32-NEXT: movl %ecx, %edx
-; KNL_32-NEXT: andl $1, %edx
-; KNL_32-NEXT: testb %dl, %dl
+; KNL_32-NEXT: testb %al, %al
; KNL_32-NEXT: je .LBB29_6
; KNL_32-NEXT: # BB#5: # %cond.load4
-; KNL_32-NEXT: vpextrd $2, %xmm1, %edx
-; KNL_32-NEXT: vpinsrd $2, (%edx), %xmm0, %xmm0
+; KNL_32-NEXT: vpextrd $2, %xmm1, %esi
+; KNL_32-NEXT: vpinsrd $2, (%esi), %xmm0, %xmm0
; KNL_32-NEXT: .LBB29_6: # %else5
-; KNL_32-NEXT: kmovw %k0, %edx
; KNL_32-NEXT: vmovd %edx, %xmm1
-; KNL_32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; KNL_32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; KNL_32-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
+; KNL_32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; KNL_32-NEXT: popl %esi
+; KNL_32-NEXT: popl %ebx
; KNL_32-NEXT: retl
;
; SKX-LABEL: test30:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm2, %xmm2
-; SKX-NEXT: vpmovd2m %xmm2, %k1
+; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1
; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm1
; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SKX-NEXT: # implicit-def: %XMM0
-; SKX-NEXT: andb $1, %al
+; SKX-NEXT: testb %al, %al
; SKX-NEXT: je .LBB29_2
; SKX-NEXT: # BB#1: # %cond.load
; SKX-NEXT: vmovq %xmm1, %rax
-; SKX-NEXT: vmovd (%rax), %xmm0
+; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX-NEXT: .LBB29_2: # %else
; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SKX-NEXT: andb $1, %al
+; SKX-NEXT: testb %al, %al
; SKX-NEXT: je .LBB29_4
; SKX-NEXT: # BB#3: # %cond.load1
; SKX-NEXT: vpextrq $1, %xmm1, %rax
@@ -1497,15 +1478,14 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x
; SKX-NEXT: .LBB29_4: # %else2
; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SKX-NEXT: andb $1, %al
+; SKX-NEXT: testb %al, %al
; SKX-NEXT: je .LBB29_6
; SKX-NEXT: # BB#5: # %cond.load4
-; SKX-NEXT: vextracti128 $1, %ymm1, %xmm1
+; SKX-NEXT: vextracti64x2 $1, %ymm1, %xmm1
; SKX-NEXT: vmovq %xmm1, %rax
; SKX-NEXT: vpinsrd $2, (%rax), %xmm0, %xmm0
; SKX-NEXT: .LBB29_6: # %else5
-; SKX-NEXT: vmovdqa32 %xmm0, %xmm3 {%k1}
-; SKX-NEXT: vmovaps %zmm3, %zmm0
+; SKX-NEXT: vpblendmd %xmm0, %xmm3, %xmm0 {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test30:
@@ -1514,36 +1494,36 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x
; SKX_32-NEXT: .Ltmp0:
; SKX_32-NEXT: .cfi_def_cfa_offset 16
; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
-; SKX_32-NEXT: vpmovd2m %xmm2, %k1
+; SKX_32-NEXT: vptestmd %xmm2, %xmm2, %k1
; SKX_32-NEXT: kmovb %k1, {{[0-9]+}}(%esp)
; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
-; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1
; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
-; SKX_32-NEXT: # implicit-def: %XMM1
-; SKX_32-NEXT: andb $1, %al
+; SKX_32-NEXT: # implicit-def: %XMM0
+; SKX_32-NEXT: testb %al, %al
; SKX_32-NEXT: je .LBB29_2
; SKX_32-NEXT: # BB#1: # %cond.load
-; SKX_32-NEXT: vmovd %xmm2, %eax
-; SKX_32-NEXT: vmovd (%eax), %xmm1
+; SKX_32-NEXT: vmovd %xmm1, %eax
+; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX_32-NEXT: .LBB29_2: # %else
; SKX_32-NEXT: kmovb %k1, {{[0-9]+}}(%esp)
; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al
-; SKX_32-NEXT: andb $1, %al
+; SKX_32-NEXT: testb %al, %al
; SKX_32-NEXT: je .LBB29_4
; SKX_32-NEXT: # BB#3: # %cond.load1
-; SKX_32-NEXT: vpextrd $1, %xmm2, %eax
-; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm1, %xmm1
+; SKX_32-NEXT: vpextrd $1, %xmm1, %eax
+; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm0, %xmm0
; SKX_32-NEXT: .LBB29_4: # %else2
-; SKX_32-NEXT: vmovdqa32 {{[0-9]+}}(%esp), %xmm0
+; SKX_32-NEXT: vmovdqa32 {{[0-9]+}}(%esp), %xmm2
; SKX_32-NEXT: kmovb %k1, (%esp)
; SKX_32-NEXT: movb (%esp), %al
-; SKX_32-NEXT: andb $1, %al
+; SKX_32-NEXT: testb %al, %al
; SKX_32-NEXT: je .LBB29_6
; SKX_32-NEXT: # BB#5: # %cond.load4
-; SKX_32-NEXT: vpextrd $2, %xmm2, %eax
-; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm1
+; SKX_32-NEXT: vpextrd $2, %xmm1, %eax
+; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
; SKX_32-NEXT: .LBB29_6: # %else5
-; SKX_32-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
+; SKX_32-NEXT: vpblendmd %xmm0, %xmm2, %xmm0 {%k1}
; SKX_32-NEXT: addl $12, %esp
; SKX_32-NEXT: retl
@@ -1660,12 +1640,12 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
; KNL_32-LABEL: test_gather_16i64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Ltmp0:
+; KNL_32-NEXT: .Ltmp4:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Ltmp1:
+; KNL_32-NEXT: .Ltmp5:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Ltmp2:
+; KNL_32-NEXT: .Ltmp6:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
@@ -1783,12 +1763,12 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
; KNL_32-LABEL: test_gather_16f64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Ltmp3:
+; KNL_32-NEXT: .Ltmp7:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Ltmp4:
+; KNL_32-NEXT: .Ltmp8:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Ltmp5:
+; KNL_32-NEXT: .Ltmp9:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
@@ -1900,12 +1880,12 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
; KNL_32-LABEL: test_scatter_16i64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Ltmp6:
+; KNL_32-NEXT: .Ltmp10:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Ltmp7:
+; KNL_32-NEXT: .Ltmp11:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Ltmp8:
+; KNL_32-NEXT: .Ltmp12:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
@@ -2014,12 +1994,12 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
; KNL_32-LABEL: test_scatter_16f64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Ltmp9:
+; KNL_32-NEXT: .Ltmp13:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Ltmp10:
+; KNL_32-NEXT: .Ltmp14:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Ltmp11:
+; KNL_32-NEXT: .Ltmp15:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll
index c29933e266b2..e3657d67ad0e 100644
--- a/test/CodeGen/X86/masked_memop.ll
+++ b/test/CodeGen/X86/masked_memop.ll
@@ -1,379 +1,1562 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s --check-prefix=AVX512
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s --check-prefix=AVX2
-; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s --check-prefix=AVX_SCALAR
-; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=skx < %s | FileCheck %s --check-prefix=SKX
-
-; AVX512-LABEL: test1
-; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
-
-; AVX2-LABEL: test1
-; AVX2: vpmaskmovd {{.*}}(%rdi)
-; AVX2: vpmaskmovd {{.*}}(%rdi)
-; AVX2-NOT: blend
-
-; AVX_SCALAR-LABEL: test1
-; AVX_SCALAR-NOT: masked
-; AVX_SCALAR: extractelement
-; AVX_SCALAR: insertelement
-; AVX_SCALAR: extractelement
-; AVX_SCALAR: insertelement
+; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx2 < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=SKX
+
+; To test for the case where masked load/store is not legal, we should add a run with a target
+; that does not have AVX, but that case should probably be a separate test file with fewer tests,
+; because it takes over 1.2 seconds to codegen these tests on a 4GHz Haswell if there's no maskmov.
+
define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) {
+; AVX1-LABEL: test1:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmaskmovps 32(%rdi), %ymm1, %ymm1
+; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test1:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmaskmovd 32(%rdi), %ymm1, %ymm1
+; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test1:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
- %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>undef)
+ %res = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>undef)
ret <16 x i32> %res
}
-; AVX512-LABEL: test2
-; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
-
-; AVX2-LABEL: test2
-; AVX2: vpmaskmovd {{.*}}(%rdi)
-; AVX2: vpmaskmovd {{.*}}(%rdi)
-; AVX2-NOT: blend
define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) {
+; AVX1-LABEL: test2:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vmaskmovps 32(%rdi), %ymm1, %ymm1
+; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test2:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmaskmovd 32(%rdi), %ymm1, %ymm1
+; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test2:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
- %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>zeroinitializer)
+ %res = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>zeroinitializer)
ret <16 x i32> %res
}
-; AVX512-LABEL: test3
-; AVX512: vmovdqu32 %zmm1, (%rdi) {%k1}
-
-; AVX_SCALAR-LABEL: test3
-; AVX_SCALAR-NOT: masked
-; AVX_SCALAR: extractelement
-; AVX_SCALAR: store
-; AVX_SCALAR: extractelement
-; AVX_SCALAR: store
-; AVX_SCALAR: extractelement
-; AVX_SCALAR: store
define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) {
+; AVX1-LABEL: test3:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vmaskmovps %ymm3, %ymm1, 32(%rdi)
+; AVX1-NEXT: vmaskmovps %ymm2, %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test3:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpmaskmovd %ymm3, %ymm1, 32(%rdi)
+; AVX2-NEXT: vpmaskmovd %ymm2, %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test3:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; AVX512-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
+; AVX512-NEXT: retq
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
- call void @llvm.masked.store.v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask)
+ call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask)
ret void
}
-; AVX512-LABEL: test4
-; AVX512: vmovups (%rdi), %zmm{{.*{%k[1-7]}}}
-
-; AVX2-LABEL: test4
-; AVX2: vmaskmovps {{.*}}(%rdi)
-; AVX2: vmaskmovps {{.*}}(%rdi)
-; AVX2: blend
define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %dst) {
+; AVX1-LABEL: test4:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm4
+; AVX1-NEXT: vblendvps %ymm0, %ymm4, %ymm2, %ymm0
+; AVX1-NEXT: vmaskmovps 32(%rdi), %ymm1, %ymm2
+; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm3, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test4:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm4
+; AVX2-NEXT: vblendvps %ymm0, %ymm4, %ymm2, %ymm0
+; AVX2-NEXT: vmaskmovps 32(%rdi), %ymm1, %ymm2
+; AVX2-NEXT: vblendvps %ymm1, %ymm2, %ymm3, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test4:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; AVX512-NEXT: vmovups (%rdi), %zmm1 {%k1}
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: retq
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
- %res = call <16 x float> @llvm.masked.load.v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst)
+ %res = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst)
ret <16 x float> %res
}
-; AVX512-LABEL: test5
-; AVX512: vmovupd (%rdi), %zmm1 {%k1}
-
-; AVX2-LABEL: test5
-; AVX2: vmaskmovpd
-; AVX2: vblendvpd
-; AVX2: vmaskmovpd
-; AVX2: vblendvpd
define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double> %dst) {
+; AVX1-LABEL: test5:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
+; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
+; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1
+; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test5:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
+; AVX2-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
+; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
+; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1
+; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test5:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test5:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %ymm2, %ymm2, %ymm2
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
- %res = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %addr, i32 4, <8 x i1>%mask, <8 x double>%dst)
+ %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1>%mask, <8 x double>%dst)
ret <8 x double> %res
}
-; AVX2-LABEL: test6
-; AVX2: vmaskmovpd
-; AVX2: vblendvpd
-
-; SKX-LABEL: test6
-; SKX: vmovupd {{.*}}{%k1}
define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
+; AVX-LABEL: test6:
+; AVX: ## BB#0:
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test6:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2
+; AVX512F-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test6:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
+; SKX-NEXT: vmovupd (%rdi), %xmm1 {%k1}
+; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: retq
%mask = icmp eq <2 x i64> %trigger, zeroinitializer
- %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
+ %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
ret <2 x double> %res
}
-; AVX2-LABEL: test7
-; AVX2: vmaskmovps {{.*}}(%rdi)
-; AVX2: blend
-
-; SKX-LABEL: test7
-; SKX: vmovups (%rdi){{.*}}{%k1}
define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) {
+; AVX-LABEL: test7:
+; AVX: ## BB#0:
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test7:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test7:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; SKX-NEXT: vmovups (%rdi), %xmm1 {%k1}
+; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
- %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst)
+ %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst)
ret <4 x float> %res
}
-; AVX2-LABEL: test8
-; AVX2: vpmaskmovd {{.*}}(%rdi)
-; AVX2: blend
-
-; SKX-LABEL: test8
-; SKX: vmovdqu32 (%rdi){{.*}}{%k1}
define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
+; AVX1-LABEL: test8:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test8:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
+; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
+; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; SKX-NEXT: vmovdqu32 (%rdi), %xmm1 {%k1}
+; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
- %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
+ %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
ret <4 x i32> %res
}
-; AVX2-LABEL: test9
-; AVX2: vpmaskmovd %xmm
-
-; SKX-LABEL: test9
-; SKX: vmovdqu32 %xmm{{.*}}{%k1}
define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
+; AVX1-LABEL: test9:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test9:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test9:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test9:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
+; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
- call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
ret void
}
-; AVX2-LABEL: test10
-; AVX2: vmaskmovpd (%rdi), %ymm
-; AVX2: blend
-
-; SKX-LABEL: test10
-; SKX: vmovapd {{.*}}{%k1}
define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
+; AVX1-LABEL: test10:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
+; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test10:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
+; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test10:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX512F-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2
+; AVX512F-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test10:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; SKX-NEXT: vmovapd (%rdi), %ymm1 {%k1}
+; SKX-NEXT: vmovaps %ymm1, %ymm0
+; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
- %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst)
+ %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst)
ret <4 x double> %res
}
-; AVX2-LABEL: test11a
-; AVX2: vmaskmovps
-; AVX2: vblendvps
+define <4 x double> @test10b(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
+; AVX1-LABEL: test10b:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test10b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test10b:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX512F-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test10b:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; SKX-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %mask = icmp eq <4 x i32> %trigger, zeroinitializer
+ %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>zeroinitializer)
+ ret <4 x double> %res
+}
-; SKX-LABEL: test11a
-; SKX: vmovaps (%rdi), %ymm1 {%k1}
-; AVX512-LABEL: test11a
-; AVX512: kshiftlw $8
-; AVX512: kshiftrw $8
-; AVX512: vmovups (%rdi), %zmm1 {%k1}
define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
+; AVX1-LABEL: test11a:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
+; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test11a:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
+; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test11a:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $8, %k0, %k0
+; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vmovups (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test11a:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %ymm2, %ymm2, %ymm2
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; SKX-NEXT: vmovaps (%rdi), %ymm1 {%k1}
+; SKX-NEXT: vmovaps %ymm1, %ymm0
+; SKX-NEXT: retq
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
- %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst)
+ %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst)
ret <8 x float> %res
}
-; SKX-LABEL: test11b
-; SKX: vmovdqu32 (%rdi), %ymm1 {%k1}
-; AVX512-LABEL: test11b
-; AVX512: kshiftlw $8
-; AVX512: kshiftrw $8
-; AVX512: vmovdqu32 (%rdi), %zmm1 {%k1}
define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
- %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst)
+; AVX1-LABEL: test11b:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
+; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test11b:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm2
+; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test11b:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $8, %k0, %k0
+; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test11b:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vmovdqu32 (%rdi), %ymm1 {%k1}
+; SKX-NEXT: vmovaps %ymm1, %ymm0
+; SKX-NEXT: retq
+ %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst)
ret <8 x i32> %res
}
-; SKX-LABEL: test11c
-; SKX: vmovaps (%rdi), %ymm0 {%k1} {z}
-; AVX512-LABEL: test11c
-; AVX512: kshiftlw $8
-; AVX512: kshiftrw $8
-; AVX512: vmovups (%rdi), %zmm0 {%k1} {z}
define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) {
- %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
+; AVX1-LABEL: test11c:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test11c:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test11c:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $8, %k0, %k0
+; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test11c:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
ret <8 x float> %res
}
-; SKX-LABEL: test11d
-; SKX: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
-; AVX512-LABEL: test11d
-; AVX512: kshiftlw $8
-; AVX512: kshiftrw $8
-; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) {
- %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
+; AVX1-LABEL: test11d:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test11d:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test11d:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $8, %k0, %k0
+; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test11d:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
ret <8 x i32> %res
}
-; AVX2-LABEL: test12
-; AVX2: vpmaskmovd %ymm
-
-; SKX-LABEL: test12
-; SKX: vmovdqu32 {{.*}}{%k1}
define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
+; AVX1-LABEL: test12:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test12:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test12:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $8, %k0, %k0
+; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test12:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %ymm2, %ymm2, %ymm2
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; SKX-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1}
+; SKX-NEXT: retq
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
- call void @llvm.masked.store.v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask)
+ call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask)
ret void
}
-; AVX512-LABEL: test13
-; AVX512: vmovups %zmm1, (%rdi) {%k1}
-
define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val) {
+; AVX1-LABEL: test13:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vmaskmovps %ymm3, %ymm1, 32(%rdi)
+; AVX1-NEXT: vmaskmovps %ymm2, %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test13:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vmaskmovps %ymm3, %ymm1, 32(%rdi)
+; AVX2-NEXT: vmaskmovps %ymm2, %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test13:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; AVX512-NEXT: vmovups %zmm1, (%rdi) {%k1}
+; AVX512-NEXT: retq
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
- call void @llvm.masked.store.v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask)
+ call void @llvm.masked.store.v16f32.p0v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask)
ret void
}
-; AVX2-LABEL: test14
-; AVX2: vpshufd
-; AVX2: vmovq
-; AVX2: vmaskmovps
-
-; SKX-LABEL: test14
-; SKX: kshiftl
-; SKX: kshiftr
-; SKX: vmovups {{.*}}{%k1}
-
define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
+; AVX1-LABEL: test14:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test14:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test14:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test14:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k0
+; SKX-NEXT: kshiftlw $14, %k0, %k0
+; SKX-NEXT: kshiftrw $14, %k0, %k1
+; SKX-NEXT: vmovups %xmm1, (%rdi) {%k1}
+; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
- call void @llvm.masked.store.v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
+ call void @llvm.masked.store.v2f32.p0v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
ret void
}
-; AVX2-LABEL: test15
-; AVX2: vpmaskmovd
-
+define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
+; AVX1-LABEL: test15:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test15:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test15:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512F-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test15:
; SKX: ## BB#0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1}
; SKX-NEXT: retq
-define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
- call void @llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
+ call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
ret void
}
-; AVX2-LABEL: test16
-; AVX2: vmaskmovps
-; AVX2: vblendvps
-
-; SKX-LABEL: test16
-; SKX: kshiftl
-; SKX: kshiftr
-; SKX: vmovups {{.*}}{%k1}
define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
+; AVX1-LABEL: test16:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test16:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k0
+; SKX-NEXT: kshiftlw $14, %k0, %k0
+; SKX-NEXT: kshiftrw $14, %k0, %k1
+; SKX-NEXT: vmovups (%rdi), %xmm1 {%k1}
+; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
- %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
+ %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
ret <2 x float> %res
}
-; AVX2-LABEL: test17
-; AVX2: vpmaskmovd
-; AVX2: vblendvps
-; AVX2: vpmovsxdq
-
-; SKX-LABEL: test17
-; SKX: kshiftl
-; SKX: kshiftr
-; SKX: vmovdqu32 {{.*}}{%k1}
define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
+; AVX1-LABEL: test17:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test17:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test17:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test17:
+; SKX: ## BB#0:
+; SKX-NEXT: vpxord %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k0
+; SKX-NEXT: kshiftlw $14, %k0, %k0
+; SKX-NEXT: kshiftrw $14, %k0, %k1
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
+; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
+; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
- %res = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
+ %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
ret <2 x i32> %res
}
-; AVX2-LABEL: test18
-; AVX2: vmaskmovps
-; AVX2-NOT: blend
-; AVX2: ret
define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
+; AVX1-LABEL: test18:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test18:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test18:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512F-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test18:
; SKX: ## BB#0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vpxord %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
-; SKX-NEXT: kshiftlw $2, %k0, %k0
-; SKX-NEXT: kshiftrw $2, %k0, %k1
+; SKX-NEXT: kshiftlw $14, %k0, %k0
+; SKX-NEXT: kshiftrw $14, %k0, %k1
; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
- %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef)
+ %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef)
ret <2 x float> %res
}
-; AVX_SCALAR-LABEL: test19
-; AVX_SCALAR: load <4 x float>, <4 x float>* %addr, align 4
-
-define <4 x float> @test19(<4 x i32> %trigger, <4 x float>* %addr) {
+define <4 x float> @load_all(<4 x i32> %trigger, <4 x float>* %addr) {
+; AVX-LABEL: load_all:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovups (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: load_all:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: load_all:
+; SKX: ## BB#0:
+; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
- %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>, <4 x float>undef)
+ %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>, <4 x float>undef)
ret <4 x float> %res
}
-; AVX_SCALAR-LABEL: test20
-; AVX_SCALAR: load float, {{.*}}, align 4
-; AVX_SCALAR: insertelement <4 x float> undef, float
-; AVX_SCALAR: select <4 x i1> <i1 true, i1 false, i1 true, i1 true>
+;;; Loads with Constant Masks - these should be optimized to use something other than a variable blend.
-define <4 x float> @test20(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %src0) {
- %mask = icmp eq <4 x i32> %trigger, zeroinitializer
- %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 16, <4 x i1><i1 true, i1 false, i1 true, i1 true>, <4 x float> %src0)
+; 128-bit FP vectors are supported with AVX.
+
+define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst) {
+; AVX-LABEL: mload_constmask_v4f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3]
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: mload_constmask_v4f32:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [4294967295,0,4294967295,4294967295]
+; AVX512F-NEXT: vmaskmovps (%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: mload_constmask_v4f32:
+; SKX: ## BB#0:
+; SKX-NEXT: movb $13, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1}
+; SKX-NEXT: retq
+ %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %dst)
ret <4 x float> %res
}
-; AVX_SCALAR-LABEL: test21
-; AVX_SCALAR: store <4 x i32> %val
+; 128-bit integer vectors are supported with AVX2.
+
+define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) {
+; AVX1-LABEL: mload_constmask_v4i32:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
+; AVX1-NEXT: vmaskmovps (%rdi), %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mload_constmask_v4i32:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
+; AVX2-NEXT: vpmaskmovd (%rdi), %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mload_constmask_v4i32:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
+; AVX512F-NEXT: vpmaskmovd (%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: mload_constmask_v4i32:
+; SKX: ## BB#0:
+; SKX-NEXT: movb $14, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
+; SKX-NEXT: retq
+ %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i32> %dst)
+ ret <4 x i32> %res
+}
+
+; 256-bit FP vectors are supported with AVX.
+
+define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst) {
+; AVX-LABEL: mload_constmask_v8f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,0]
+; AVX-NEXT: vmaskmovps (%rdi), %ymm1, %ymm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: mload_constmask_v8f32:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F-NEXT: movw $7, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1}
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: mload_constmask_v8f32:
+; SKX: ## BB#0:
+; SKX-NEXT: movb $7, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovups (%rdi), %ymm0 {%k1}
+; SKX-NEXT: retq
+ %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %dst)
+ ret <8 x float> %res
+}
+
+define <4 x double> @mload_constmask_v4f64(<4 x double>* %addr, <4 x double> %dst) {
+; AVX-LABEL: mload_constmask_v4f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
+; AVX-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm1
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: mload_constmask_v4f64:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
+; AVX512F-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm2
+; AVX512F-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: mload_constmask_v4f64:
+; SKX: ## BB#0:
+; SKX-NEXT: movb $7, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovupd (%rdi), %ymm0 {%k1}
+; SKX-NEXT: retq
+ %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %dst)
+ ret <4 x double> %res
+}
+
+; 256-bit integer vectors are supported with AVX2.
+
+define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {
+; AVX1-LABEL: mload_constmask_v8i32:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mload_constmask_v8i32:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mload_constmask_v8i32:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F-NEXT: movw $135, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}
+; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: mload_constmask_v8i32:
+; SKX: ## BB#0:
+; SKX-NEXT: movb $-121, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1}
+; SKX-NEXT: retq
+ %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %dst)
+ ret <8 x i32> %res
+}
+
+define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {
+; AVX1-LABEL: mload_constmask_v4i64:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = mem[0],ymm0[1,2],mem[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mload_constmask_v4i64:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mload_constmask_v4i64:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [18446744073709551615,0,0,18446744073709551615]
+; AVX512F-NEXT: vpmaskmovq (%rdi), %ymm1, %ymm2
+; AVX512F-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: mload_constmask_v4i64:
+; SKX: ## BB#0:
+; SKX-NEXT: movb $9, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1}
+; SKX-NEXT: retq
+ %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %dst)
+ ret <4 x i64> %res
+}
+
+; 512-bit FP vectors are supported with AVX512.
+
+define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %dst) {
+; AVX-LABEL: mload_constmask_v8f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],mem[3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1,2],ymm0[3]
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: mload_constmask_v8f64:
+; AVX512: ## BB#0:
+; AVX512-NEXT: movb $-121, %al
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vmovupd (%rdi), %zmm0 {%k1}
+; AVX512-NEXT: retq
+ %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x double> %dst)
+ ret <8 x double> %res
+}
+
+; If the pass-through operand is undef, no blend is needed.
+
+define <4 x double> @mload_constmask_v4f64_undef_passthrough(<4 x double>* %addr) {
+; AVX-LABEL: mload_constmask_v4f64_undef_passthrough:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
+; AVX-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: mload_constmask_v4f64_undef_passthrough:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
+; AVX512F-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: mload_constmask_v4f64_undef_passthrough:
+; SKX: ## BB#0:
+; SKX-NEXT: movb $7, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> undef)
+ ret <4 x double> %res
+}
+
+define <4 x i64> @mload_constmask_v4i64_undef_passthrough(<4 x i64>* %addr) {
+; AVX1-LABEL: mload_constmask_v4i64_undef_passthrough:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mload_constmask_v4i64_undef_passthrough:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
+; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mload_constmask_v4i64_undef_passthrough:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
+; AVX512F-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: mload_constmask_v4i64_undef_passthrough:
+; SKX: ## BB#0:
+; SKX-NEXT: movb $6, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i64> undef)
+ ret <4 x i64> %res
+}
+
define void @test21(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
+; AVX1-LABEL: test21:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test21:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test21:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test21:
+; SKX: ## BB#0:
+; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
+; SKX-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
- call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>)
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>)
ret void
}
-; AVX_SCALAR-LABEL: test22
-; AVX_SCALAR: extractelement <4 x i32> %val, i32 0
-; AVX_SCALAR: store i32
-define void @test22(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
- %mask = icmp eq <4 x i32> %trigger, zeroinitializer
- call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>)
+; When only one element of the mask is set, reduce to a scalar store.
+
+define void @one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
+; AVX-LABEL: one_mask_bit_set1:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovd %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: one_mask_bit_set1:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vmovd %xmm0, (%rdi)
+; AVX512-NEXT: retq
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>)
+ ret void
+}
+
+; Choose a different element to show that the correct address offset is produced.
+
+define void @one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
+; AVX-LABEL: one_mask_bit_set2:
+; AVX: ## BB#0:
+; AVX-NEXT: vextractps $2, %xmm0, 8(%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: one_mask_bit_set2:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vextractps $2, %xmm0, 8(%rdi)
+; AVX512-NEXT: retq
+ call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
+ ret void
+}
+
+; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
+
+define void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
+; AVX-LABEL: one_mask_bit_set3:
+; AVX: ## BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovlps %xmm0, 16(%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: one_mask_bit_set3:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vmovq %xmm0, 16(%rdi)
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: one_mask_bit_set3:
+; SKX: ## BB#0:
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0
+; SKX-NEXT: vmovq %xmm0, 16(%rdi)
+; SKX-NEXT: retq
+ call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
ret void
}
-declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
-declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
-declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
-declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
-declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
-declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
-declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
-declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
-declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
-declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
-declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
-declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
-declare <8 x i32> @llvm.masked.load.v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
-declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
-declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
-declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
-declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
-declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
-declare void @llvm.masked.store.v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
-declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
-declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
-
-declare <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>*, i32, <16 x i1>, <16 x i32*>)
-
-; AVX512-LABEL: test23
-; AVX512: vmovdqu64 64(%rdi), %zmm1 {%k2} {z}
-; AVX512: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
+; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
+
+define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
+; AVX-LABEL: one_mask_bit_set4:
+; AVX: ## BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovhpd %xmm0, 24(%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: one_mask_bit_set4:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vmovhpd %xmm0, 24(%rdi)
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: one_mask_bit_set4:
+; SKX: ## BB#0:
+; SKX-NEXT: vextractf32x4 $1, %ymm0, %xmm0
+; SKX-NEXT: vmovhpd %xmm0, 24(%rdi)
+; SKX-NEXT: retq
+ call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>)
+ ret void
+}
+
+; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.
+
+define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
+; AVX-LABEL: one_mask_bit_set5:
+; AVX: ## BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX-NEXT: vmovlps %xmm0, 48(%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: one_mask_bit_set5:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
+; AVX512-NEXT: vmovlpd %xmm0, 48(%rdi)
+; AVX512-NEXT: retq
+ call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
+ ret void
+}
+
+; When only one element of the mask is set, reduce to a scalar load.
+
+define <4 x i32> @load_one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
+; AVX-LABEL: load_one_mask_bit_set1:
+; AVX: ## BB#0:
+; AVX-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: load_one_mask_bit_set1:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>, <4 x i32> %val)
+ ret <4 x i32> %res
+}
+
+; Choose a different element to show that the correct address offset is produced.
+
+define <4 x float> @load_one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
+; AVX-LABEL: load_one_mask_bit_set2:
+; AVX: ## BB#0:
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: load_one_mask_bit_set2:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX512-NEXT: retq
+ %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>, <4 x float> %val)
+ ret <4 x float> %res
+}
+
+; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
+
+define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
+; AVX1-LABEL: load_one_mask_bit_set3:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_one_mask_bit_set3:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: load_one_mask_bit_set3:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: load_one_mask_bit_set3:
+; SKX: ## BB#0:
+; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; SKX-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
+; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
+; SKX-NEXT: retq
+ %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val)
+ ret <4 x i64> %res
+}
+
+; Choose a different scalar type and a high element of a 256-bit vector because AVX doesn't support those evenly.
+
+define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
+; AVX-LABEL: load_one_mask_bit_set4:
+; AVX: ## BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: load_one_mask_bit_set4:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: load_one_mask_bit_set4:
+; SKX: ## BB#0:
+; SKX-NEXT: vextractf32x4 $1, %ymm0, %xmm1
+; SKX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; SKX-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
+; SKX-NEXT: retq
+ %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>, <4 x double> %val)
+ ret <4 x double> %res
+}
+
+; Try a 512-bit vector to make sure AVX doesn't die and AVX512 works as expected.
+
+define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
+; AVX-LABEL: load_one_mask_bit_set5:
+; AVX: ## BB#0:
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: load_one_mask_bit_set5:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1
+; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
+ ret <8 x double> %res
+}
+
+declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
+declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>)
+declare void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v2f32.p0v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v2i32.p0v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
+declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
+declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
+declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
+declare <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
+declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4f64.p0v4f64(<4 x double>, <4 x double>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
+
+declare <16 x i32*> @llvm.masked.load.v16p0i32.p0v16p0i32(<16 x i32*>*, i32, <16 x i1>, <16 x i32*>)
define <16 x i32*> @test23(<16 x i32*> %trigger, <16 x i32*>* %addr) {
+; AVX1-LABEL: test23:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqq %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqq %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpcmpeqq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqq %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vmaskmovpd 96(%rdi), %ymm3, %ymm3
+; AVX1-NEXT: vmaskmovpd 64(%rdi), %ymm2, %ymm2
+; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm1
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test23:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqq %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpmaskmovq 96(%rdi), %ymm3, %ymm3
+; AVX2-NEXT: vpmaskmovq 64(%rdi), %ymm2, %ymm2
+; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm1, %ymm1
+; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test23:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
+; AVX512-NEXT: vpcmpeqq %zmm2, %zmm1, %k2
+; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k2} {z}
+; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
+; AVX512-NEXT: retq
%mask = icmp eq <16 x i32*> %trigger, zeroinitializer
- %res = call <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>* %addr, i32 4, <16 x i1>%mask, <16 x i32*>zeroinitializer)
+ %res = call <16 x i32*> @llvm.masked.load.v16p0i32.p0v16p0i32(<16 x i32*>* %addr, i32 4, <16 x i1>%mask, <16 x i32*>zeroinitializer)
ret <16 x i32*> %res
}
%mystruct = type { i16, i16, [1 x i8*] }
-declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>*, i32, <16 x i1>, <16 x %mystruct*>)
+declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct.p0v16p0mystruct(<16 x %mystruct*>*, i32, <16 x i1>, <16 x %mystruct*>)
define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) {
-; AVX512-LABEL: test24:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
-; AVX512-NEXT: kshiftrw $8, %k1, %k1
-; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
-; AVX512-NEXT: retq
+; AVX1-LABEL: test24:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmaskmovpd 96(%rdi), %ymm1, %ymm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmaskmovpd 64(%rdi), %ymm1, %ymm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm0, %ymm1
+; AVX1-NEXT: vmovapd %ymm4, %ymm0
+; AVX1-NEXT: retq
;
; AVX2-LABEL: test24:
; AVX2: ## BB#0:
@@ -403,6 +1586,16 @@ define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) {
; AVX2-NEXT: vmovdqa %ymm4, %ymm0
; AVX2-NEXT: retq
;
+; AVX512F-LABEL: test24:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: kshiftrw $8, %k1, %k1
+; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test24:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
@@ -411,20 +1604,50 @@ define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) {
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
; SKX-NEXT: retq
- %res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1>%mask, <16 x %mystruct*>zeroinitializer)
+ %res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct.p0v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1>%mask, <16 x %mystruct*>zeroinitializer)
ret <16 x %mystruct*> %res
}
define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
-; AVX512-LABEL: test_store_16i64:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
-; AVX512-NEXT: kshiftrw $8, %k1, %k1
-; AVX512-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1}
-; AVX512-NEXT: retq
+; AVX1-LABEL: test_store_16i64:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm5, %xmm5
+; AVX1-NEXT: vpsrad $31, %xmm5, %xmm5
+; AVX1-NEXT: vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT: vmaskmovpd %ymm1, %ymm5, (%rdi)
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; AVX1-NEXT: vmaskmovpd %ymm4, %ymm1, 96(%rdi)
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT: vmaskmovpd %ymm3, %ymm1, 64(%rdi)
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmaskmovpd %ymm2, %ymm0, 32(%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: test_store_16i64:
; AVX2: ## BB#0:
@@ -454,6 +1677,16 @@ define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %sr
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
+; AVX512F-LABEL: test_store_16i64:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT: kshiftrw $8, %k1, %k1
+; AVX512F-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1}
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test_store_16i64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
@@ -462,20 +1695,51 @@ define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %sr
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1}
; SKX-NEXT: retq
- call void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask)
+ call void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask)
+declare void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask)
+
define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
-; AVX512-LABEL: test_store_16f64:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512-NEXT: vmovupd %zmm1, (%rdi) {%k1}
-; AVX512-NEXT: kshiftrw $8, %k1, %k1
-; AVX512-NEXT: vmovupd %zmm2, 64(%rdi) {%k1}
-; AVX512-NEXT: retq
+; AVX1-LABEL: test_store_16f64:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm5, %xmm5
+; AVX1-NEXT: vpsrad $31, %xmm5, %xmm5
+; AVX1-NEXT: vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT: vmaskmovpd %ymm1, %ymm5, (%rdi)
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; AVX1-NEXT: vmaskmovpd %ymm4, %ymm1, 96(%rdi)
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT: vmaskmovpd %ymm3, %ymm1, 64(%rdi)
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmaskmovpd %ymm2, %ymm0, 32(%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: test_store_16f64:
; AVX2: ## BB#0:
@@ -505,6 +1769,16 @@ define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x doubl
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
+; AVX512F-LABEL: test_store_16f64:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vmovupd %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT: kshiftrw $8, %k1, %k1
+; AVX512F-NEXT: vmovupd %zmm2, 64(%rdi) {%k1}
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test_store_16f64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
@@ -513,22 +1787,55 @@ define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x doubl
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vmovupd %zmm2, 64(%rdi) {%k1}
; SKX-NEXT: retq
- call void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask)
+ call void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask)
ret void
}
-declare void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask)
+declare void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask)
+
define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
-; AVX512-LABEL: test_load_16i64:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
-; AVX512-NEXT: kshiftrw $8, %k1, %k1
-; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
-; AVX512-NEXT: vmovaps %zmm2, %zmm1
-; AVX512-NEXT: retq
+; AVX1-LABEL: test_load_16i64:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm5, %xmm5
+; AVX1-NEXT: vpsrad $31, %xmm5, %xmm5
+; AVX1-NEXT: vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm5, %ymm6
+; AVX1-NEXT: vblendvpd %ymm5, %ymm6, %ymm1, %ymm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
+; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm6
+; AVX1-NEXT: vblendvpd %ymm1, %ymm6, %ymm2, %ymm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2
+; AVX1-NEXT: vmaskmovpd 64(%rdi), %ymm2, %ymm6
+; AVX1-NEXT: vblendvpd %ymm2, %ymm6, %ymm3, %ymm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: vmaskmovpd 96(%rdi), %ymm0, %ymm3
+; AVX1-NEXT: vblendvpd %ymm0, %ymm3, %ymm4, %ymm3
+; AVX1-NEXT: vmovapd %ymm5, %ymm0
+; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_16i64:
; AVX2: ## BB#0:
@@ -536,22 +1843,22 @@ define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64
; AVX2-NEXT: vpslld $31, %xmm5, %xmm5
; AVX2-NEXT: vpsrad $31, %xmm5, %xmm5
; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5
-; AVX2-NEXT: vpmaskmovq (%rdi), %ymm5, %ymm9
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
-; AVX2-NEXT: vpslld $31, %xmm7, %xmm7
-; AVX2-NEXT: vpsrad $31, %xmm7, %xmm7
-; AVX2-NEXT: vpmovsxdq %xmm7, %ymm7
-; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm7, %ymm8
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; AVX2-NEXT: vpslld $31, %xmm6, %xmm6
-; AVX2-NEXT: vpsrad $31, %xmm6, %xmm6
-; AVX2-NEXT: vpmovsxdq %xmm6, %ymm6
-; AVX2-NEXT: vpmaskmovq 64(%rdi), %ymm6, %ymm10
-; AVX2-NEXT: vblendvpd %ymm5, %ymm9, %ymm1, %ymm5
-; AVX2-NEXT: vblendvpd %ymm7, %ymm8, %ymm2, %ymm1
-; AVX2-NEXT: vblendvpd %ymm6, %ymm10, %ymm3, %ymm2
+; AVX2-NEXT: vpmaskmovq (%rdi), %ymm5, %ymm6
+; AVX2-NEXT: vblendvpd %ymm5, %ymm6, %ymm1, %ymm5
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm1, %ymm6
+; AVX2-NEXT: vblendvpd %ymm1, %ymm6, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT: vpmaskmovq 64(%rdi), %ymm2, %ymm6
+; AVX2-NEXT: vblendvpd %ymm2, %ymm6, %ymm3, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
@@ -562,6 +1869,18 @@ define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64
; AVX2-NEXT: vmovapd %ymm5, %ymm0
; AVX2-NEXT: retq
;
+; AVX512F-LABEL: test_load_16i64:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT: kshiftrw $8, %k1, %k1
+; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
+; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovaps %zmm2, %zmm1
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test_load_16i64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
@@ -572,22 +1891,55 @@ define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: vmovaps %zmm2, %zmm1
; SKX-NEXT: retq
- %res = call <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
+ %res = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
ret <16 x i64> %res
}
-declare <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
+declare <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
+
define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
-; AVX512-LABEL: test_load_16f64:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512-NEXT: vmovupd (%rdi), %zmm1 {%k1}
-; AVX512-NEXT: kshiftrw $8, %k1, %k1
-; AVX512-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
-; AVX512-NEXT: vmovaps %zmm2, %zmm1
-; AVX512-NEXT: retq
+; AVX1-LABEL: test_load_16f64:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm5, %xmm5
+; AVX1-NEXT: vpsrad $31, %xmm5, %xmm5
+; AVX1-NEXT: vpmovsxdq %xmm5, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm5, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT: vmaskmovpd (%rdi), %ymm5, %ymm6
+; AVX1-NEXT: vblendvpd %ymm5, %ymm6, %ymm1, %ymm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
+; AVX1-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm6
+; AVX1-NEXT: vblendvpd %ymm1, %ymm6, %ymm2, %ymm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2
+; AVX1-NEXT: vmaskmovpd 64(%rdi), %ymm2, %ymm6
+; AVX1-NEXT: vblendvpd %ymm2, %ymm6, %ymm3, %ymm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: vmaskmovpd 96(%rdi), %ymm0, %ymm3
+; AVX1-NEXT: vblendvpd %ymm0, %ymm3, %ymm4, %ymm3
+; AVX1-NEXT: vmovapd %ymm5, %ymm0
+; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_16f64:
; AVX2: ## BB#0:
@@ -595,22 +1947,22 @@ define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16
; AVX2-NEXT: vpslld $31, %xmm5, %xmm5
; AVX2-NEXT: vpsrad $31, %xmm5, %xmm5
; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5
-; AVX2-NEXT: vmaskmovpd (%rdi), %ymm5, %ymm9
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
-; AVX2-NEXT: vpslld $31, %xmm7, %xmm7
-; AVX2-NEXT: vpsrad $31, %xmm7, %xmm7
-; AVX2-NEXT: vpmovsxdq %xmm7, %ymm7
-; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm7, %ymm8
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; AVX2-NEXT: vpslld $31, %xmm6, %xmm6
-; AVX2-NEXT: vpsrad $31, %xmm6, %xmm6
-; AVX2-NEXT: vpmovsxdq %xmm6, %ymm6
-; AVX2-NEXT: vmaskmovpd 64(%rdi), %ymm6, %ymm10
-; AVX2-NEXT: vblendvpd %ymm5, %ymm9, %ymm1, %ymm5
-; AVX2-NEXT: vblendvpd %ymm7, %ymm8, %ymm2, %ymm1
-; AVX2-NEXT: vblendvpd %ymm6, %ymm10, %ymm3, %ymm2
+; AVX2-NEXT: vmaskmovpd (%rdi), %ymm5, %ymm6
+; AVX2-NEXT: vblendvpd %ymm5, %ymm6, %ymm1, %ymm5
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm1, %ymm6
+; AVX2-NEXT: vblendvpd %ymm1, %ymm6, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT: vmaskmovpd 64(%rdi), %ymm2, %ymm6
+; AVX2-NEXT: vblendvpd %ymm2, %ymm6, %ymm3, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
@@ -621,6 +1973,18 @@ define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16
; AVX2-NEXT: vmovapd %ymm5, %ymm0
; AVX2-NEXT: retq
;
+; AVX512F-LABEL: test_load_16f64:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k1}
+; AVX512F-NEXT: kshiftrw $8, %k1, %k1
+; AVX512F-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
+; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovaps %zmm2, %zmm1
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test_load_16f64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
@@ -631,32 +1995,117 @@ define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: vmovaps %zmm2, %zmm1
; SKX-NEXT: retq
- %res = call <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
+ %res = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
ret <16 x double> %res
}
-declare <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
+declare <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) {
-; AVX512-LABEL: test_load_32f64:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX512-NEXT: vpmovsxbd %xmm5, %zmm5
-; AVX512-NEXT: vpslld $31, %zmm5, %zmm5
-; AVX512-NEXT: vptestmd %zmm5, %zmm5, %k1
-; AVX512-NEXT: vmovupd 128(%rdi), %zmm3 {%k1}
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k2
-; AVX512-NEXT: vmovupd (%rdi), %zmm1 {%k2}
-; AVX512-NEXT: kshiftrw $8, %k1, %k1
-; AVX512-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
-; AVX512-NEXT: kshiftrw $8, %k2, %k1
-; AVX512-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
-; AVX512-NEXT: vmovaps %zmm1, %zmm0
-; AVX512-NEXT: vmovaps %zmm2, %zmm1
-; AVX512-NEXT: vmovaps %zmm3, %zmm2
-; AVX512-NEXT: vmovaps %zmm4, %zmm3
-; AVX512-NEXT: retq
+; AVX1-LABEL: test_load_32f64:
+; AVX1: ## BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: Ltmp0:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: Ltmp1:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: Ltmp2:
+; AVX1-NEXT: .cfi_def_cfa_register %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $32, %rsp
+; AVX1-NEXT: vmovapd 16(%rbp), %ymm8
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm9, %xmm9
+; AVX1-NEXT: vpsrad $31, %xmm9, %xmm9
+; AVX1-NEXT: vpmovsxdq %xmm9, %xmm10
+; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm9, %xmm9
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
+; AVX1-NEXT: vmaskmovpd 32(%rsi), %ymm9, %ymm10
+; AVX1-NEXT: vblendvpd %ymm9, %ymm10, %ymm2, %ymm9
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm10
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm10, %ymm2
+; AVX1-NEXT: vmaskmovpd 64(%rsi), %ymm2, %ymm10
+; AVX1-NEXT: vblendvpd %ymm2, %ymm10, %ymm3, %ymm11
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm10
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm10, %ymm2
+; AVX1-NEXT: vmaskmovpd 96(%rsi), %ymm2, %ymm10
+; AVX1-NEXT: vblendvpd %ymm2, %ymm10, %ymm4, %ymm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm10
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3
+; AVX1-NEXT: vmaskmovpd 160(%rsi), %ymm3, %ymm10
+; AVX1-NEXT: vblendvpd %ymm3, %ymm10, %ymm6, %ymm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm10
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3
+; AVX1-NEXT: vmaskmovpd 192(%rsi), %ymm3, %ymm10
+; AVX1-NEXT: vblendvpd %ymm3, %ymm10, %ymm7, %ymm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm3, %xmm3
+; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm10
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3
+; AVX1-NEXT: vmaskmovpd 224(%rsi), %ymm3, %ymm10
+; AVX1-NEXT: vblendvpd %ymm3, %ymm10, %ymm8, %ymm3
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm8
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0
+; AVX1-NEXT: vmaskmovpd (%rsi), %ymm0, %ymm8
+; AVX1-NEXT: vblendvpd %ymm0, %ymm8, %ymm1, %ymm0
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmaskmovpd 128(%rsi), %ymm1, %ymm2
+; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm5, %ymm1
+; AVX1-NEXT: vmovapd %ymm1, 128(%rdi)
+; AVX1-NEXT: vmovapd %ymm0, (%rdi)
+; AVX1-NEXT: vmovapd %ymm3, 224(%rdi)
+; AVX1-NEXT: vmovapd %ymm7, 192(%rdi)
+; AVX1-NEXT: vmovapd %ymm6, 160(%rdi)
+; AVX1-NEXT: vmovapd %ymm4, 96(%rdi)
+; AVX1-NEXT: vmovapd %ymm11, 64(%rdi)
+; AVX1-NEXT: vmovapd %ymm9, 32(%rdi)
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_32f64:
; AVX2: ## BB#0:
@@ -670,27 +2119,28 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32
; AVX2-NEXT: .cfi_def_cfa_register %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $32, %rsp
-; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
-; AVX2-NEXT: vpslld $31, %xmm8, %xmm8
-; AVX2-NEXT: vpsrad $31, %xmm8, %xmm8
-; AVX2-NEXT: vpmovsxdq %xmm8, %ymm8
-; AVX2-NEXT: vmaskmovpd 32(%rsi), %ymm8, %ymm9
-; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero
-; AVX2-NEXT: vpslld $31, %xmm10, %xmm10
-; AVX2-NEXT: vpsrad $31, %xmm10, %xmm10
-; AVX2-NEXT: vpmovsxdq %xmm10, %ymm10
-; AVX2-NEXT: vmaskmovpd 64(%rsi), %ymm10, %ymm11
-; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[3,1,2,3]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero
-; AVX2-NEXT: vpslld $31, %xmm12, %xmm12
-; AVX2-NEXT: vpsrad $31, %xmm12, %xmm12
-; AVX2-NEXT: vpmovsxdq %xmm12, %ymm12
-; AVX2-NEXT: vmaskmovpd 96(%rsi), %ymm12, %ymm13
-; AVX2-NEXT: vblendvpd %ymm8, %ymm9, %ymm2, %ymm8
-; AVX2-NEXT: vblendvpd %ymm10, %ymm11, %ymm3, %ymm9
-; AVX2-NEXT: vblendvpd %ymm12, %ymm13, %ymm4, %ymm11
+; AVX2-NEXT: vmovapd 16(%rbp), %ymm8
+; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm9, %xmm9
+; AVX2-NEXT: vpsrad $31, %xmm9, %xmm9
+; AVX2-NEXT: vpmovsxdq %xmm9, %ymm9
+; AVX2-NEXT: vmaskmovpd 32(%rsi), %ymm9, %ymm10
+; AVX2-NEXT: vblendvpd %ymm9, %ymm10, %ymm2, %ymm9
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT: vmaskmovpd 64(%rsi), %ymm2, %ymm10
+; AVX2-NEXT: vblendvpd %ymm2, %ymm10, %ymm3, %ymm11
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT: vmaskmovpd 96(%rsi), %ymm2, %ymm10
+; AVX2-NEXT: vblendvpd %ymm2, %ymm10, %ymm4, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
@@ -698,28 +2148,27 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32
; AVX2-NEXT: vpsrad $31, %xmm3, %xmm3
; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
; AVX2-NEXT: vmaskmovpd 160(%rsi), %ymm3, %ymm10
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX2-NEXT: vpslld $31, %xmm4, %xmm4
-; AVX2-NEXT: vpsrad $31, %xmm4, %xmm4
-; AVX2-NEXT: vpmovsxdq %xmm4, %ymm4
-; AVX2-NEXT: vmaskmovpd 192(%rsi), %ymm4, %ymm12
; AVX2-NEXT: vblendvpd %ymm3, %ymm10, %ymm6, %ymm3
-; AVX2-NEXT: vmovapd 16(%rbp), %ymm6
-; AVX2-NEXT: vblendvpd %ymm4, %ymm12, %ymm7, %ymm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
+; AVX2-NEXT: vpslld $31, %xmm6, %xmm6
+; AVX2-NEXT: vpsrad $31, %xmm6, %xmm6
+; AVX2-NEXT: vpmovsxdq %xmm6, %ymm6
+; AVX2-NEXT: vmaskmovpd 192(%rsi), %ymm6, %ymm10
+; AVX2-NEXT: vblendvpd %ymm6, %ymm10, %ymm7, %ymm6
; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm7, %xmm7
; AVX2-NEXT: vpsrad $31, %xmm7, %xmm7
; AVX2-NEXT: vpmovsxdq %xmm7, %ymm7
; AVX2-NEXT: vmaskmovpd 224(%rsi), %ymm7, %ymm10
-; AVX2-NEXT: vblendvpd %ymm7, %ymm10, %ymm6, %ymm6
+; AVX2-NEXT: vblendvpd %ymm7, %ymm10, %ymm8, %ymm7
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT: vmaskmovpd (%rsi), %ymm0, %ymm7
-; AVX2-NEXT: vblendvpd %ymm0, %ymm7, %ymm1, %ymm0
+; AVX2-NEXT: vmaskmovpd (%rsi), %ymm0, %ymm8
+; AVX2-NEXT: vblendvpd %ymm0, %ymm8, %ymm1, %ymm0
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
@@ -728,18 +2177,39 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32
; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm5, %ymm1
; AVX2-NEXT: vmovapd %ymm1, 128(%rdi)
; AVX2-NEXT: vmovapd %ymm0, (%rdi)
-; AVX2-NEXT: vmovapd %ymm6, 224(%rdi)
-; AVX2-NEXT: vmovapd %ymm4, 192(%rdi)
+; AVX2-NEXT: vmovapd %ymm7, 224(%rdi)
+; AVX2-NEXT: vmovapd %ymm6, 192(%rdi)
; AVX2-NEXT: vmovapd %ymm3, 160(%rdi)
-; AVX2-NEXT: vmovapd %ymm11, 96(%rdi)
-; AVX2-NEXT: vmovapd %ymm9, 64(%rdi)
-; AVX2-NEXT: vmovapd %ymm8, 32(%rdi)
+; AVX2-NEXT: vmovapd %ymm4, 96(%rdi)
+; AVX2-NEXT: vmovapd %ymm11, 64(%rdi)
+; AVX2-NEXT: vmovapd %ymm9, 32(%rdi)
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
+; AVX512F-LABEL: test_load_32f64:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX512F-NEXT: vpmovsxbd %xmm5, %zmm5
+; AVX512F-NEXT: vpslld $31, %zmm5, %zmm5
+; AVX512F-NEXT: vptestmd %zmm5, %zmm5, %k1
+; AVX512F-NEXT: vmovupd 128(%rdi), %zmm3 {%k1}
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
+; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k2}
+; AVX512F-NEXT: kshiftrw $8, %k1, %k1
+; AVX512F-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
+; AVX512F-NEXT: kshiftrw $8, %k2, %k1
+; AVX512F-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
+; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovaps %zmm2, %zmm1
+; AVX512F-NEXT: vmovaps %zmm3, %zmm2
+; AVX512F-NEXT: vmovaps %zmm4, %zmm3
+; AVX512F-NEXT: retq
+;
; SKX-LABEL: test_load_32f64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
@@ -756,7 +2226,8181 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32
; SKX-NEXT: vmovaps %zmm3, %zmm2
; SKX-NEXT: vmovaps %zmm4, %zmm3
; SKX-NEXT: retq
- %res = call <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
+ %res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
ret <32 x double> %res
}
-declare <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)
+
+declare <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)
+
+define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
+; AVX-LABEL: test_mask_load_16xi8:
+; AVX: ## BB#0:
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: ## implicit-def: %XMM1
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_2
+; AVX-NEXT: ## BB#1: ## %cond.load
+; AVX-NEXT: movzbl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: LBB50_2: ## %else
+; AVX-NEXT: vpextrb $1, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_4
+; AVX-NEXT: ## BB#3: ## %cond.load1
+; AVX-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_4: ## %else2
+; AVX-NEXT: vpextrb $2, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_6
+; AVX-NEXT: ## BB#5: ## %cond.load4
+; AVX-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_6: ## %else5
+; AVX-NEXT: vpextrb $3, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_8
+; AVX-NEXT: ## BB#7: ## %cond.load7
+; AVX-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_8: ## %else8
+; AVX-NEXT: vpextrb $4, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_10
+; AVX-NEXT: ## BB#9: ## %cond.load10
+; AVX-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_10: ## %else11
+; AVX-NEXT: vpextrb $5, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_12
+; AVX-NEXT: ## BB#11: ## %cond.load13
+; AVX-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_12: ## %else14
+; AVX-NEXT: vpextrb $6, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_14
+; AVX-NEXT: ## BB#13: ## %cond.load16
+; AVX-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_14: ## %else17
+; AVX-NEXT: vpextrb $7, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_16
+; AVX-NEXT: ## BB#15: ## %cond.load19
+; AVX-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_16: ## %else20
+; AVX-NEXT: vpextrb $8, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_18
+; AVX-NEXT: ## BB#17: ## %cond.load22
+; AVX-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_18: ## %else23
+; AVX-NEXT: vpextrb $9, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_20
+; AVX-NEXT: ## BB#19: ## %cond.load25
+; AVX-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_20: ## %else26
+; AVX-NEXT: vpextrb $10, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_22
+; AVX-NEXT: ## BB#21: ## %cond.load28
+; AVX-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_22: ## %else29
+; AVX-NEXT: vpextrb $11, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_24
+; AVX-NEXT: ## BB#23: ## %cond.load31
+; AVX-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_24: ## %else32
+; AVX-NEXT: vpextrb $12, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_26
+; AVX-NEXT: ## BB#25: ## %cond.load34
+; AVX-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_26: ## %else35
+; AVX-NEXT: vpextrb $13, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_28
+; AVX-NEXT: ## BB#27: ## %cond.load37
+; AVX-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_28: ## %else38
+; AVX-NEXT: vpextrb $14, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_30
+; AVX-NEXT: ## BB#29: ## %cond.load40
+; AVX-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_30: ## %else41
+; AVX-NEXT: vpextrb $15, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB50_32
+; AVX-NEXT: ## BB#31: ## %cond.load43
+; AVX-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB50_32: ## %else44
+; AVX-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_load_16xi8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: ## implicit-def: %XMM0
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_2
+; AVX512F-NEXT: ## BB#1: ## %cond.load
+; AVX512F-NEXT: movzbl (%rdi), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: LBB50_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_4
+; AVX512F-NEXT: ## BB#3: ## %cond.load1
+; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_6
+; AVX512F-NEXT: ## BB#5: ## %cond.load4
+; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_6: ## %else5
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_8
+; AVX512F-NEXT: ## BB#7: ## %cond.load7
+; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_8: ## %else8
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_10
+; AVX512F-NEXT: ## BB#9: ## %cond.load10
+; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_10: ## %else11
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_12
+; AVX512F-NEXT: ## BB#11: ## %cond.load13
+; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_12: ## %else14
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_14
+; AVX512F-NEXT: ## BB#13: ## %cond.load16
+; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_14: ## %else17
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_16
+; AVX512F-NEXT: ## BB#15: ## %cond.load19
+; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_16: ## %else20
+; AVX512F-NEXT: kshiftlw $7, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_18
+; AVX512F-NEXT: ## BB#17: ## %cond.load22
+; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_18: ## %else23
+; AVX512F-NEXT: kshiftlw $6, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_20
+; AVX512F-NEXT: ## BB#19: ## %cond.load25
+; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_20: ## %else26
+; AVX512F-NEXT: kshiftlw $5, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_22
+; AVX512F-NEXT: ## BB#21: ## %cond.load28
+; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_22: ## %else29
+; AVX512F-NEXT: kshiftlw $4, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_24
+; AVX512F-NEXT: ## BB#23: ## %cond.load31
+; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_24: ## %else32
+; AVX512F-NEXT: kshiftlw $3, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_26
+; AVX512F-NEXT: ## BB#25: ## %cond.load34
+; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_26: ## %else35
+; AVX512F-NEXT: kshiftlw $2, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_28
+; AVX512F-NEXT: ## BB#27: ## %cond.load37
+; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_28: ## %else38
+; AVX512F-NEXT: kshiftlw $1, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_30
+; AVX512F-NEXT: ## BB#29: ## %cond.load40
+; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_30: ## %else41
+; AVX512F-NEXT: kshiftlw $0, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB50_32
+; AVX512F-NEXT: ## BB#31: ## %cond.load43
+; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB50_32: ## %else44
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vpblendvb %xmm1, %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_load_16xi8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %res = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %addr, i32 4, <16 x i1>%mask, <16 x i8> undef)
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
+
+define <32 x i8> @test_mask_load_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
+; AVX1-LABEL: test_mask_load_32xi8:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: ## implicit-def: %YMM1
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_2
+; AVX1-NEXT: ## BB#1: ## %cond.load
+; AVX1-NEXT: movzbl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: LBB51_2: ## %else
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_4
+; AVX1-NEXT: ## BB#3: ## %cond.load1
+; AVX1-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_4: ## %else2
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_6
+; AVX1-NEXT: ## BB#5: ## %cond.load4
+; AVX1-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_6: ## %else5
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_8
+; AVX1-NEXT: ## BB#7: ## %cond.load7
+; AVX1-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_8: ## %else8
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_10
+; AVX1-NEXT: ## BB#9: ## %cond.load10
+; AVX1-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_10: ## %else11
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_12
+; AVX1-NEXT: ## BB#11: ## %cond.load13
+; AVX1-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_12: ## %else14
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_14
+; AVX1-NEXT: ## BB#13: ## %cond.load16
+; AVX1-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_14: ## %else17
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_16
+; AVX1-NEXT: ## BB#15: ## %cond.load19
+; AVX1-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_16: ## %else20
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_18
+; AVX1-NEXT: ## BB#17: ## %cond.load22
+; AVX1-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_18: ## %else23
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_20
+; AVX1-NEXT: ## BB#19: ## %cond.load25
+; AVX1-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_20: ## %else26
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_22
+; AVX1-NEXT: ## BB#21: ## %cond.load28
+; AVX1-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_22: ## %else29
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_24
+; AVX1-NEXT: ## BB#23: ## %cond.load31
+; AVX1-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_24: ## %else32
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_26
+; AVX1-NEXT: ## BB#25: ## %cond.load34
+; AVX1-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_26: ## %else35
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_28
+; AVX1-NEXT: ## BB#27: ## %cond.load37
+; AVX1-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_28: ## %else38
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_30
+; AVX1-NEXT: ## BB#29: ## %cond.load40
+; AVX1-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_30: ## %else41
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_32
+; AVX1-NEXT: ## BB#31: ## %cond.load43
+; AVX1-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB51_32: ## %else44
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $0, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_34
+; AVX1-NEXT: ## BB#33: ## %cond.load46
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $0, 16(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_34: ## %else47
+; AVX1-NEXT: vpextrb $1, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_36
+; AVX1-NEXT: ## BB#35: ## %cond.load49
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $1, 17(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_36: ## %else50
+; AVX1-NEXT: vpextrb $2, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_38
+; AVX1-NEXT: ## BB#37: ## %cond.load52
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $2, 18(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_38: ## %else53
+; AVX1-NEXT: vpextrb $3, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_40
+; AVX1-NEXT: ## BB#39: ## %cond.load55
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $3, 19(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_40: ## %else56
+; AVX1-NEXT: vpextrb $4, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_42
+; AVX1-NEXT: ## BB#41: ## %cond.load58
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $4, 20(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_42: ## %else59
+; AVX1-NEXT: vpextrb $5, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_44
+; AVX1-NEXT: ## BB#43: ## %cond.load61
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $5, 21(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_44: ## %else62
+; AVX1-NEXT: vpextrb $6, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_46
+; AVX1-NEXT: ## BB#45: ## %cond.load64
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $6, 22(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_46: ## %else65
+; AVX1-NEXT: vpextrb $7, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_48
+; AVX1-NEXT: ## BB#47: ## %cond.load67
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $7, 23(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_48: ## %else68
+; AVX1-NEXT: vpextrb $8, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_50
+; AVX1-NEXT: ## BB#49: ## %cond.load70
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $8, 24(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_50: ## %else71
+; AVX1-NEXT: vpextrb $9, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_52
+; AVX1-NEXT: ## BB#51: ## %cond.load73
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $9, 25(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_52: ## %else74
+; AVX1-NEXT: vpextrb $10, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_54
+; AVX1-NEXT: ## BB#53: ## %cond.load76
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $10, 26(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_54: ## %else77
+; AVX1-NEXT: vpextrb $11, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_56
+; AVX1-NEXT: ## BB#55: ## %cond.load79
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $11, 27(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_56: ## %else80
+; AVX1-NEXT: vpextrb $12, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_58
+; AVX1-NEXT: ## BB#57: ## %cond.load82
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $12, 28(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_58: ## %else83
+; AVX1-NEXT: vpextrb $13, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_60
+; AVX1-NEXT: ## BB#59: ## %cond.load85
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $13, 29(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_60: ## %else86
+; AVX1-NEXT: vpextrb $14, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_62
+; AVX1-NEXT: ## BB#61: ## %cond.load88
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $14, 30(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_62: ## %else89
+; AVX1-NEXT: vpextrb $15, %xmm2, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB51_64
+; AVX1-NEXT: ## BB#63: ## %cond.load91
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrb $15, 31(%rdi), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: LBB51_64: ## %else92
+; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_load_32xi8:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: ## implicit-def: %YMM1
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_2
+; AVX2-NEXT: ## BB#1: ## %cond.load
+; AVX2-NEXT: movzbl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: LBB51_2: ## %else
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_4
+; AVX2-NEXT: ## BB#3: ## %cond.load1
+; AVX2-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_4: ## %else2
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_6
+; AVX2-NEXT: ## BB#5: ## %cond.load4
+; AVX2-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_6: ## %else5
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_8
+; AVX2-NEXT: ## BB#7: ## %cond.load7
+; AVX2-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_8: ## %else8
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_10
+; AVX2-NEXT: ## BB#9: ## %cond.load10
+; AVX2-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_10: ## %else11
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_12
+; AVX2-NEXT: ## BB#11: ## %cond.load13
+; AVX2-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_12: ## %else14
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_14
+; AVX2-NEXT: ## BB#13: ## %cond.load16
+; AVX2-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_14: ## %else17
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_16
+; AVX2-NEXT: ## BB#15: ## %cond.load19
+; AVX2-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_16: ## %else20
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_18
+; AVX2-NEXT: ## BB#17: ## %cond.load22
+; AVX2-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_18: ## %else23
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_20
+; AVX2-NEXT: ## BB#19: ## %cond.load25
+; AVX2-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_20: ## %else26
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_22
+; AVX2-NEXT: ## BB#21: ## %cond.load28
+; AVX2-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_22: ## %else29
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_24
+; AVX2-NEXT: ## BB#23: ## %cond.load31
+; AVX2-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_24: ## %else32
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_26
+; AVX2-NEXT: ## BB#25: ## %cond.load34
+; AVX2-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_26: ## %else35
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_28
+; AVX2-NEXT: ## BB#27: ## %cond.load37
+; AVX2-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_28: ## %else38
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_30
+; AVX2-NEXT: ## BB#29: ## %cond.load40
+; AVX2-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_30: ## %else41
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_32
+; AVX2-NEXT: ## BB#31: ## %cond.load43
+; AVX2-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB51_32: ## %else44
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_34
+; AVX2-NEXT: ## BB#33: ## %cond.load46
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $0, 16(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_34: ## %else47
+; AVX2-NEXT: vpextrb $1, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_36
+; AVX2-NEXT: ## BB#35: ## %cond.load49
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $1, 17(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_36: ## %else50
+; AVX2-NEXT: vpextrb $2, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_38
+; AVX2-NEXT: ## BB#37: ## %cond.load52
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $2, 18(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_38: ## %else53
+; AVX2-NEXT: vpextrb $3, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_40
+; AVX2-NEXT: ## BB#39: ## %cond.load55
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $3, 19(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_40: ## %else56
+; AVX2-NEXT: vpextrb $4, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_42
+; AVX2-NEXT: ## BB#41: ## %cond.load58
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $4, 20(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_42: ## %else59
+; AVX2-NEXT: vpextrb $5, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_44
+; AVX2-NEXT: ## BB#43: ## %cond.load61
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $5, 21(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_44: ## %else62
+; AVX2-NEXT: vpextrb $6, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_46
+; AVX2-NEXT: ## BB#45: ## %cond.load64
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $6, 22(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_46: ## %else65
+; AVX2-NEXT: vpextrb $7, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_48
+; AVX2-NEXT: ## BB#47: ## %cond.load67
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $7, 23(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_48: ## %else68
+; AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_50
+; AVX2-NEXT: ## BB#49: ## %cond.load70
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $8, 24(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_50: ## %else71
+; AVX2-NEXT: vpextrb $9, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_52
+; AVX2-NEXT: ## BB#51: ## %cond.load73
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $9, 25(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_52: ## %else74
+; AVX2-NEXT: vpextrb $10, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_54
+; AVX2-NEXT: ## BB#53: ## %cond.load76
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $10, 26(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_54: ## %else77
+; AVX2-NEXT: vpextrb $11, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_56
+; AVX2-NEXT: ## BB#55: ## %cond.load79
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $11, 27(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_56: ## %else80
+; AVX2-NEXT: vpextrb $12, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_58
+; AVX2-NEXT: ## BB#57: ## %cond.load82
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $12, 28(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_58: ## %else83
+; AVX2-NEXT: vpextrb $13, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_60
+; AVX2-NEXT: ## BB#59: ## %cond.load85
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $13, 29(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_60: ## %else86
+; AVX2-NEXT: vpextrb $14, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_62
+; AVX2-NEXT: ## BB#61: ## %cond.load88
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpinsrb $14, 30(%rdi), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_62: ## %else89
+; AVX2-NEXT: vpextrb $15, %xmm2, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB51_64
+; AVX2-NEXT: ## BB#63: ## %cond.load91
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrb $15, 31(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB51_64: ## %else92
+; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_load_32xi8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512F-NEXT: ## implicit-def: %YMM1
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_2
+; AVX512F-NEXT: ## BB#1: ## %cond.load
+; AVX512F-NEXT: movzbl (%rdi), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm1
+; AVX512F-NEXT: LBB51_2: ## %else
+; AVX512F-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_4
+; AVX512F-NEXT: ## BB#3: ## %cond.load1
+; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_4: ## %else2
+; AVX512F-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_6
+; AVX512F-NEXT: ## BB#5: ## %cond.load4
+; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_6: ## %else5
+; AVX512F-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_8
+; AVX512F-NEXT: ## BB#7: ## %cond.load7
+; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_8: ## %else8
+; AVX512F-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_10
+; AVX512F-NEXT: ## BB#9: ## %cond.load10
+; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_10: ## %else11
+; AVX512F-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_12
+; AVX512F-NEXT: ## BB#11: ## %cond.load13
+; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_12: ## %else14
+; AVX512F-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_14
+; AVX512F-NEXT: ## BB#13: ## %cond.load16
+; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_14: ## %else17
+; AVX512F-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_16
+; AVX512F-NEXT: ## BB#15: ## %cond.load19
+; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_16: ## %else20
+; AVX512F-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_18
+; AVX512F-NEXT: ## BB#17: ## %cond.load22
+; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_18: ## %else23
+; AVX512F-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_20
+; AVX512F-NEXT: ## BB#19: ## %cond.load25
+; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_20: ## %else26
+; AVX512F-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_22
+; AVX512F-NEXT: ## BB#21: ## %cond.load28
+; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_22: ## %else29
+; AVX512F-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_24
+; AVX512F-NEXT: ## BB#23: ## %cond.load31
+; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_24: ## %else32
+; AVX512F-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_26
+; AVX512F-NEXT: ## BB#25: ## %cond.load34
+; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_26: ## %else35
+; AVX512F-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_28
+; AVX512F-NEXT: ## BB#27: ## %cond.load37
+; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_28: ## %else38
+; AVX512F-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_30
+; AVX512F-NEXT: ## BB#29: ## %cond.load40
+; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_30: ## %else41
+; AVX512F-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_32
+; AVX512F-NEXT: ## BB#31: ## %cond.load43
+; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB51_32: ## %else44
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_34
+; AVX512F-NEXT: ## BB#33: ## %cond.load46
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $0, 16(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_34: ## %else47
+; AVX512F-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_36
+; AVX512F-NEXT: ## BB#35: ## %cond.load49
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $1, 17(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_36: ## %else50
+; AVX512F-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_38
+; AVX512F-NEXT: ## BB#37: ## %cond.load52
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $2, 18(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_38: ## %else53
+; AVX512F-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_40
+; AVX512F-NEXT: ## BB#39: ## %cond.load55
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $3, 19(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_40: ## %else56
+; AVX512F-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_42
+; AVX512F-NEXT: ## BB#41: ## %cond.load58
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $4, 20(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_42: ## %else59
+; AVX512F-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_44
+; AVX512F-NEXT: ## BB#43: ## %cond.load61
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $5, 21(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_44: ## %else62
+; AVX512F-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_46
+; AVX512F-NEXT: ## BB#45: ## %cond.load64
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $6, 22(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_46: ## %else65
+; AVX512F-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_48
+; AVX512F-NEXT: ## BB#47: ## %cond.load67
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $7, 23(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_48: ## %else68
+; AVX512F-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_50
+; AVX512F-NEXT: ## BB#49: ## %cond.load70
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $8, 24(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_50: ## %else71
+; AVX512F-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_52
+; AVX512F-NEXT: ## BB#51: ## %cond.load73
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $9, 25(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_52: ## %else74
+; AVX512F-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_54
+; AVX512F-NEXT: ## BB#53: ## %cond.load76
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $10, 26(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_54: ## %else77
+; AVX512F-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_56
+; AVX512F-NEXT: ## BB#55: ## %cond.load79
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $11, 27(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_56: ## %else80
+; AVX512F-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_58
+; AVX512F-NEXT: ## BB#57: ## %cond.load82
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $12, 28(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_58: ## %else83
+; AVX512F-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_60
+; AVX512F-NEXT: ## BB#59: ## %cond.load85
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $13, 29(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_60: ## %else86
+; AVX512F-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_62
+; AVX512F-NEXT: ## BB#61: ## %cond.load88
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpinsrb $14, 30(%rdi), %xmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_62: ## %else89
+; AVX512F-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB51_64
+; AVX512F-NEXT: ## BB#63: ## %cond.load91
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $15, 31(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB51_64: ## %else92
+; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_load_32xi8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
+; SKX-NEXT: vpmovb2m %ymm0, %k1
+; SKX-NEXT: vmovdqu8 (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %res = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* %addr, i32 4, <32 x i1>%mask, <32 x i8> zeroinitializer)
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>)
+
+define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> %val) {
+; AVX1-LABEL: test_mask_load_64xi8:
+; AVX1: ## BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: Ltmp3:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %r15
+; AVX1-NEXT: Ltmp4:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: Ltmp5:
+; AVX1-NEXT: .cfi_def_cfa_offset 32
+; AVX1-NEXT: pushq %r13
+; AVX1-NEXT: Ltmp6:
+; AVX1-NEXT: .cfi_def_cfa_offset 40
+; AVX1-NEXT: pushq %r12
+; AVX1-NEXT: Ltmp7:
+; AVX1-NEXT: .cfi_def_cfa_offset 48
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: Ltmp8:
+; AVX1-NEXT: .cfi_def_cfa_offset 56
+; AVX1-NEXT: pushq %rax
+; AVX1-NEXT: Ltmp9:
+; AVX1-NEXT: .cfi_def_cfa_offset 64
+; AVX1-NEXT: Ltmp10:
+; AVX1-NEXT: .cfi_offset %rbx, -56
+; AVX1-NEXT: Ltmp11:
+; AVX1-NEXT: .cfi_offset %r12, -48
+; AVX1-NEXT: Ltmp12:
+; AVX1-NEXT: .cfi_offset %r13, -40
+; AVX1-NEXT: Ltmp13:
+; AVX1-NEXT: .cfi_offset %r14, -32
+; AVX1-NEXT: Ltmp14:
+; AVX1-NEXT: .cfi_offset %r15, -24
+; AVX1-NEXT: Ltmp15:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movl %edi, %r13d
+; AVX1-NEXT: testb $1, %dil
+; AVX1-NEXT: je LBB52_2
+; AVX1-NEXT: ## BB#1: ## %cond.load
+; AVX1-NEXT: movzbl (%rax), %ebp
+; AVX1-NEXT: vmovd %ebp, %xmm9
+; AVX1-NEXT: LBB52_2: ## %else
+; AVX1-NEXT: testb $1, %sil
+; AVX1-NEXT: je LBB52_4
+; AVX1-NEXT: ## BB#3: ## %cond.load1
+; AVX1-NEXT: vpinsrb $1, 1(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_4: ## %else2
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB52_6
+; AVX1-NEXT: ## BB#5: ## %cond.load4
+; AVX1-NEXT: vpinsrb $2, 2(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_6: ## %else5
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB52_8
+; AVX1-NEXT: ## BB#7: ## %cond.load7
+; AVX1-NEXT: vpinsrb $3, 3(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_8: ## %else8
+; AVX1-NEXT: testb $1, %r8b
+; AVX1-NEXT: je LBB52_10
+; AVX1-NEXT: ## BB#9: ## %cond.load10
+; AVX1-NEXT: vpinsrb $4, 4(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_10: ## %else11
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r10b
+; AVX1-NEXT: testb $1, %r9b
+; AVX1-NEXT: je LBB52_12
+; AVX1-NEXT: ## BB#11: ## %cond.load13
+; AVX1-NEXT: vpinsrb $5, 5(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_12: ## %else14
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r11b
+; AVX1-NEXT: testb $1, %r10b
+; AVX1-NEXT: je LBB52_14
+; AVX1-NEXT: ## BB#13: ## %cond.load16
+; AVX1-NEXT: vpinsrb $6, 6(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_14: ## %else17
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r14b
+; AVX1-NEXT: testb $1, %r11b
+; AVX1-NEXT: je LBB52_16
+; AVX1-NEXT: ## BB#15: ## %cond.load19
+; AVX1-NEXT: vpinsrb $7, 7(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_16: ## %else20
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r15b
+; AVX1-NEXT: testb $1, %r14b
+; AVX1-NEXT: je LBB52_18
+; AVX1-NEXT: ## BB#17: ## %cond.load22
+; AVX1-NEXT: vpinsrb $8, 8(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_18: ## %else23
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %r12b
+; AVX1-NEXT: testb $1, %r15b
+; AVX1-NEXT: je LBB52_20
+; AVX1-NEXT: ## BB#19: ## %cond.load25
+; AVX1-NEXT: vpinsrb $9, 9(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_20: ## %else26
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dil
+; AVX1-NEXT: testb $1, %r12b
+; AVX1-NEXT: je LBB52_22
+; AVX1-NEXT: ## BB#21: ## %cond.load28
+; AVX1-NEXT: vpinsrb $10, 10(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_22: ## %else29
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %bpl
+; AVX1-NEXT: testb $1, %dil
+; AVX1-NEXT: je LBB52_24
+; AVX1-NEXT: ## BB#23: ## %cond.load31
+; AVX1-NEXT: vpinsrb $11, 11(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_24: ## %else32
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %bl
+; AVX1-NEXT: testb $1, %bpl
+; AVX1-NEXT: je LBB52_26
+; AVX1-NEXT: ## BB#25: ## %cond.load34
+; AVX1-NEXT: vpinsrb $12, 12(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_26: ## %else35
+; AVX1-NEXT: testb $1, %bl
+; AVX1-NEXT: je LBB52_28
+; AVX1-NEXT: ## BB#27: ## %cond.load37
+; AVX1-NEXT: vpinsrb $13, 13(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_28: ## %else38
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_30
+; AVX1-NEXT: ## BB#29: ## %cond.load40
+; AVX1-NEXT: vpinsrb $14, 14(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_30: ## %else41
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_32
+; AVX1-NEXT: ## BB#31: ## %cond.load43
+; AVX1-NEXT: vpinsrb $15, 15(%rax), %xmm9, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX1-NEXT: LBB52_32: ## %else44
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_34
+; AVX1-NEXT: ## BB#33: ## %cond.load46
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $0, 16(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_34: ## %else47
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_36
+; AVX1-NEXT: ## BB#35: ## %cond.load49
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $1, 17(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_36: ## %else50
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_38
+; AVX1-NEXT: ## BB#37: ## %cond.load52
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $2, 18(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_38: ## %else53
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_40
+; AVX1-NEXT: ## BB#39: ## %cond.load55
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $3, 19(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_40: ## %else56
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_42
+; AVX1-NEXT: ## BB#41: ## %cond.load58
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $4, 20(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_42: ## %else59
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_44
+; AVX1-NEXT: ## BB#43: ## %cond.load61
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $5, 21(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_44: ## %else62
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_46
+; AVX1-NEXT: ## BB#45: ## %cond.load64
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $6, 22(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_46: ## %else65
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_48
+; AVX1-NEXT: ## BB#47: ## %cond.load67
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $7, 23(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_48: ## %else68
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_50
+; AVX1-NEXT: ## BB#49: ## %cond.load70
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $8, 24(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_50: ## %else71
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_52
+; AVX1-NEXT: ## BB#51: ## %cond.load73
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $9, 25(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_52: ## %else74
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_54
+; AVX1-NEXT: ## BB#53: ## %cond.load76
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $10, 26(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_54: ## %else77
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_56
+; AVX1-NEXT: ## BB#55: ## %cond.load79
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $11, 27(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_56: ## %else80
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_58
+; AVX1-NEXT: ## BB#57: ## %cond.load82
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $12, 28(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_58: ## %else83
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_60
+; AVX1-NEXT: ## BB#59: ## %cond.load85
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $13, 29(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_60: ## %else86
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_62
+; AVX1-NEXT: ## BB#61: ## %cond.load88
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $14, 30(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_62: ## %else89
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_64
+; AVX1-NEXT: ## BB#63: ## %cond.load91
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vpinsrb $15, 31(%rax), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm9
+; AVX1-NEXT: LBB52_64: ## %else92
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_66
+; AVX1-NEXT: ## BB#65: ## %cond.load94
+; AVX1-NEXT: vpinsrb $0, 32(%rax), %xmm0, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: LBB52_66: ## %else95
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_68
+; AVX1-NEXT: ## BB#67: ## %cond.load97
+; AVX1-NEXT: vpinsrb $1, 33(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_68: ## %else98
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_70
+; AVX1-NEXT: ## BB#69: ## %cond.load100
+; AVX1-NEXT: vpinsrb $2, 34(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_70: ## %else101
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_72
+; AVX1-NEXT: ## BB#71: ## %cond.load103
+; AVX1-NEXT: vpinsrb $3, 35(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_72: ## %else104
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_74
+; AVX1-NEXT: ## BB#73: ## %cond.load106
+; AVX1-NEXT: vpinsrb $4, 36(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_74: ## %else107
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_76
+; AVX1-NEXT: ## BB#75: ## %cond.load109
+; AVX1-NEXT: vpinsrb $5, 37(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_76: ## %else110
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_78
+; AVX1-NEXT: ## BB#77: ## %cond.load112
+; AVX1-NEXT: vpinsrb $6, 38(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_78: ## %else113
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_80
+; AVX1-NEXT: ## BB#79: ## %cond.load115
+; AVX1-NEXT: vpinsrb $7, 39(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_80: ## %else116
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_82
+; AVX1-NEXT: ## BB#81: ## %cond.load118
+; AVX1-NEXT: vpinsrb $8, 40(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_82: ## %else119
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_84
+; AVX1-NEXT: ## BB#83: ## %cond.load121
+; AVX1-NEXT: vpinsrb $9, 41(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_84: ## %else122
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_86
+; AVX1-NEXT: ## BB#85: ## %cond.load124
+; AVX1-NEXT: vpinsrb $10, 42(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_86: ## %else125
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_88
+; AVX1-NEXT: ## BB#87: ## %cond.load127
+; AVX1-NEXT: vpinsrb $11, 43(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_88: ## %else128
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_90
+; AVX1-NEXT: ## BB#89: ## %cond.load130
+; AVX1-NEXT: vpinsrb $12, 44(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_90: ## %else131
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_92
+; AVX1-NEXT: ## BB#91: ## %cond.load133
+; AVX1-NEXT: vpinsrb $13, 45(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_92: ## %else134
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_94
+; AVX1-NEXT: ## BB#93: ## %cond.load136
+; AVX1-NEXT: vpinsrb $14, 46(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_94: ## %else137
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_96
+; AVX1-NEXT: ## BB#95: ## %cond.load139
+; AVX1-NEXT: vpinsrb $15, 47(%rax), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB52_96: ## %else140
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_98
+; AVX1-NEXT: ## BB#97: ## %cond.load142
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $0, 48(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_98: ## %else143
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_100
+; AVX1-NEXT: ## BB#99: ## %cond.load145
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $1, 49(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_100: ## %else146
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_102
+; AVX1-NEXT: ## BB#101: ## %cond.load148
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $2, 50(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_102: ## %else149
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_104
+; AVX1-NEXT: ## BB#103: ## %cond.load151
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $3, 51(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_104: ## %else152
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_106
+; AVX1-NEXT: ## BB#105: ## %cond.load154
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $4, 52(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_106: ## %else155
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_108
+; AVX1-NEXT: ## BB#107: ## %cond.load157
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $5, 53(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_108: ## %else158
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_110
+; AVX1-NEXT: ## BB#109: ## %cond.load160
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $6, 54(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_110: ## %else161
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_112
+; AVX1-NEXT: ## BB#111: ## %cond.load163
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $7, 55(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_112: ## %else164
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_114
+; AVX1-NEXT: ## BB#113: ## %cond.load166
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $8, 56(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_114: ## %else167
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_116
+; AVX1-NEXT: ## BB#115: ## %cond.load169
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $9, 57(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_116: ## %else170
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_118
+; AVX1-NEXT: ## BB#117: ## %cond.load172
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $10, 58(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_118: ## %else173
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_120
+; AVX1-NEXT: ## BB#119: ## %cond.load175
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $11, 59(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_120: ## %else176
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_122
+; AVX1-NEXT: ## BB#121: ## %cond.load178
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $12, 60(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_122: ## %else179
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_124
+; AVX1-NEXT: ## BB#123: ## %cond.load181
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $13, 61(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_124: ## %else182
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: je LBB52_126
+; AVX1-NEXT: ## BB#125: ## %cond.load184
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $14, 62(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_126: ## %else185
+; AVX1-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX1-NEXT: movl %r9d, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movl %r8d, (%rsp) ## 4-byte Spill
+; AVX1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: je LBB52_128
+; AVX1-NEXT: ## BB#127: ## %cond.load187
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrb $15, 63(%rax), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB52_128: ## %else188
+; AVX1-NEXT: movzbl %r10b, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %r11b, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %r14b, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %r15b, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %r12b, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %dil, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %bpl, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl %bl, %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; AVX1-NEXT: movzbl %r13b, %r13d
+; AVX1-NEXT: vmovd %r13d, %xmm4
+; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %edi ## 4-byte Reload
+; AVX1-NEXT: movzbl %dil, %ebp
+; AVX1-NEXT: vpinsrb $1, %ebp, %xmm4, %xmm4
+; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX1-NEXT: movzbl %bpl, %ebp
+; AVX1-NEXT: vpinsrb $2, %ebp, %xmm4, %xmm4
+; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX1-NEXT: movzbl %bpl, %ebp
+; AVX1-NEXT: vpinsrb $3, %ebp, %xmm4, %xmm4
+; AVX1-NEXT: movl (%rsp), %ebp ## 4-byte Reload
+; AVX1-NEXT: movzbl %bpl, %ebp
+; AVX1-NEXT: vpinsrb $4, %ebp, %xmm4, %xmm4
+; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX1-NEXT: movzbl %bpl, %ebp
+; AVX1-NEXT: vpinsrb $5, %ebp, %xmm4, %xmm4
+; AVX1-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX1-NEXT: vmovd -{{[0-9]+}}(%rsp), %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: ## xmm5 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $2, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $3, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $4, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm5, %xmm8 ## 4-byte Folded Reload
+; AVX1-NEXT: vmovd -{{[0-9]+}}(%rsp), %xmm6 ## 4-byte Folded Reload
+; AVX1-NEXT: ## xmm6 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX1-NEXT: vpinsrb $2, %r12d, %xmm6, %xmm6
+; AVX1-NEXT: vpinsrb $3, %r15d, %xmm6, %xmm6
+; AVX1-NEXT: vpinsrb $4, %r14d, %xmm6, %xmm6
+; AVX1-NEXT: vpinsrb $5, %r11d, %xmm6, %xmm6
+; AVX1-NEXT: vpinsrb $6, %r8d, %xmm6, %xmm6
+; AVX1-NEXT: vpinsrb $7, %edx, %xmm6, %xmm6
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; AVX1-NEXT: vpinsrb $9, %ecx, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; AVX1-NEXT: vpinsrb $10, %esi, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; AVX1-NEXT: vpinsrb $11, %r9d, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX1-NEXT: vpinsrb $12, %r10d, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; AVX1-NEXT: vpinsrb $13, %ebx, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX1-NEXT: vpinsrb $14, %r13d, %xmm6, %xmm6
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; AVX1-NEXT: vpinsrb $15, %r14d, %xmm6, %xmm10
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; AVX1-NEXT: vmovd %edi, %xmm7
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; AVX1-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
+; AVX1-NEXT: vpinsrb $1, %r11d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $2, %r15d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $3, %r12d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $4, %r8d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $5, %ecx, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $6, %r9d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $7, %esi, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $8, %r10d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $10, %r13d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $11, %edx, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $12, %r14d, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $13, %ebx, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $14, %edi, %xmm7, %xmm7
+; AVX1-NEXT: vpinsrb $15, %ebp, %xmm7, %xmm7
+; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpsllw $7, %xmm8, %xmm6
+; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX1-NEXT: vandnps %ymm0, %ymm4, %ymm0
+; AVX1-NEXT: vandps %ymm4, %ymm9, %ymm4
+; AVX1-NEXT: vorps %ymm0, %ymm4, %ymm0
+; AVX1-NEXT: vpsllw $7, %xmm10, %xmm4
+; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpgtb %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpsllw $7, %xmm7, %xmm6
+; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpcmpgtb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm3, %ymm2
+; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: addq $8, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r12
+; AVX1-NEXT: popq %r13
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_load_64xi8:
+; AVX2: ## BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: Ltmp3:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: Ltmp4:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: Ltmp5:
+; AVX2-NEXT: .cfi_def_cfa_offset 32
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: Ltmp6:
+; AVX2-NEXT: .cfi_def_cfa_offset 40
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: Ltmp7:
+; AVX2-NEXT: .cfi_def_cfa_offset 48
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: Ltmp8:
+; AVX2-NEXT: .cfi_def_cfa_offset 56
+; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: Ltmp9:
+; AVX2-NEXT: .cfi_def_cfa_offset 64
+; AVX2-NEXT: Ltmp10:
+; AVX2-NEXT: .cfi_offset %rbx, -56
+; AVX2-NEXT: Ltmp11:
+; AVX2-NEXT: .cfi_offset %r12, -48
+; AVX2-NEXT: Ltmp12:
+; AVX2-NEXT: .cfi_offset %r13, -40
+; AVX2-NEXT: Ltmp13:
+; AVX2-NEXT: .cfi_offset %r14, -32
+; AVX2-NEXT: Ltmp14:
+; AVX2-NEXT: .cfi_offset %r15, -24
+; AVX2-NEXT: Ltmp15:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movl %edi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: je LBB52_2
+; AVX2-NEXT: ## BB#1: ## %cond.load
+; AVX2-NEXT: movzbl (%rax), %ebp
+; AVX2-NEXT: vmovd %ebp, %xmm2
+; AVX2-NEXT: LBB52_2: ## %else
+; AVX2-NEXT: testb $1, %sil
+; AVX2-NEXT: je LBB52_4
+; AVX2-NEXT: ## BB#3: ## %cond.load1
+; AVX2-NEXT: vpinsrb $1, 1(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_4: ## %else2
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB52_6
+; AVX2-NEXT: ## BB#5: ## %cond.load4
+; AVX2-NEXT: vpinsrb $2, 2(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_6: ## %else5
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB52_8
+; AVX2-NEXT: ## BB#7: ## %cond.load7
+; AVX2-NEXT: vpinsrb $3, 3(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_8: ## %else8
+; AVX2-NEXT: testb $1, %r8b
+; AVX2-NEXT: je LBB52_10
+; AVX2-NEXT: ## BB#9: ## %cond.load10
+; AVX2-NEXT: vpinsrb $4, 4(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_10: ## %else11
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r10b
+; AVX2-NEXT: testb $1, %r9b
+; AVX2-NEXT: je LBB52_12
+; AVX2-NEXT: ## BB#11: ## %cond.load13
+; AVX2-NEXT: vpinsrb $5, 5(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_12: ## %else14
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r11b
+; AVX2-NEXT: testb $1, %r10b
+; AVX2-NEXT: je LBB52_14
+; AVX2-NEXT: ## BB#13: ## %cond.load16
+; AVX2-NEXT: vpinsrb $6, 6(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_14: ## %else17
+; AVX2-NEXT: testb $1, %r11b
+; AVX2-NEXT: je LBB52_16
+; AVX2-NEXT: ## BB#15: ## %cond.load19
+; AVX2-NEXT: vpinsrb $7, 7(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_16: ## %else20
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_18
+; AVX2-NEXT: ## BB#17: ## %cond.load22
+; AVX2-NEXT: vpinsrb $8, 8(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_18: ## %else23
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_20
+; AVX2-NEXT: ## BB#19: ## %cond.load25
+; AVX2-NEXT: vpinsrb $9, 9(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_20: ## %else26
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_22
+; AVX2-NEXT: ## BB#21: ## %cond.load28
+; AVX2-NEXT: vpinsrb $10, 10(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_22: ## %else29
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %bpl
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_24
+; AVX2-NEXT: ## BB#23: ## %cond.load31
+; AVX2-NEXT: vpinsrb $11, 11(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_24: ## %else32
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %bl
+; AVX2-NEXT: testb $1, %bpl
+; AVX2-NEXT: je LBB52_26
+; AVX2-NEXT: ## BB#25: ## %cond.load34
+; AVX2-NEXT: vpinsrb $12, 12(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_26: ## %else35
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r14b
+; AVX2-NEXT: testb $1, %bl
+; AVX2-NEXT: je LBB52_28
+; AVX2-NEXT: ## BB#27: ## %cond.load37
+; AVX2-NEXT: vpinsrb $13, 13(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_28: ## %else38
+; AVX2-NEXT: testb $1, %r14b
+; AVX2-NEXT: je LBB52_30
+; AVX2-NEXT: ## BB#29: ## %cond.load40
+; AVX2-NEXT: vpinsrb $14, 14(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_30: ## %else41
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r13b
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_32
+; AVX2-NEXT: ## BB#31: ## %cond.load43
+; AVX2-NEXT: vpinsrb $15, 15(%rax), %xmm2, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: LBB52_32: ## %else44
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r12b
+; AVX2-NEXT: testb $1, %r13b
+; AVX2-NEXT: je LBB52_34
+; AVX2-NEXT: ## BB#33: ## %cond.load46
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $0, 16(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_34: ## %else47
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %r15b
+; AVX2-NEXT: testb $1, %r12b
+; AVX2-NEXT: je LBB52_36
+; AVX2-NEXT: ## BB#35: ## %cond.load49
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $1, 17(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_36: ## %else50
+; AVX2-NEXT: testb $1, %r15b
+; AVX2-NEXT: je LBB52_38
+; AVX2-NEXT: ## BB#37: ## %cond.load52
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $2, 18(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_38: ## %else53
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_40
+; AVX2-NEXT: ## BB#39: ## %cond.load55
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $3, 19(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_40: ## %else56
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_42
+; AVX2-NEXT: ## BB#41: ## %cond.load58
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $4, 20(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_42: ## %else59
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_44
+; AVX2-NEXT: ## BB#43: ## %cond.load61
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $5, 21(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_44: ## %else62
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_46
+; AVX2-NEXT: ## BB#45: ## %cond.load64
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $6, 22(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_46: ## %else65
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_48
+; AVX2-NEXT: ## BB#47: ## %cond.load67
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $7, 23(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_48: ## %else68
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_50
+; AVX2-NEXT: ## BB#49: ## %cond.load70
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $8, 24(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_50: ## %else71
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_52
+; AVX2-NEXT: ## BB#51: ## %cond.load73
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $9, 25(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_52: ## %else74
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_54
+; AVX2-NEXT: ## BB#53: ## %cond.load76
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $10, 26(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_54: ## %else77
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_56
+; AVX2-NEXT: ## BB#55: ## %cond.load79
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $11, 27(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_56: ## %else80
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_58
+; AVX2-NEXT: ## BB#57: ## %cond.load82
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $12, 28(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_58: ## %else83
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_60
+; AVX2-NEXT: ## BB#59: ## %cond.load85
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $13, 29(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_60: ## %else86
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_62
+; AVX2-NEXT: ## BB#61: ## %cond.load88
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $14, 30(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_62: ## %else89
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_64
+; AVX2-NEXT: ## BB#63: ## %cond.load91
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpinsrb $15, 31(%rax), %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: LBB52_64: ## %else92
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_66
+; AVX2-NEXT: ## BB#65: ## %cond.load94
+; AVX2-NEXT: vpinsrb $0, 32(%rax), %xmm0, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: LBB52_66: ## %else95
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_68
+; AVX2-NEXT: ## BB#67: ## %cond.load97
+; AVX2-NEXT: vpinsrb $1, 33(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_68: ## %else98
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_70
+; AVX2-NEXT: ## BB#69: ## %cond.load100
+; AVX2-NEXT: vpinsrb $2, 34(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_70: ## %else101
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_72
+; AVX2-NEXT: ## BB#71: ## %cond.load103
+; AVX2-NEXT: vpinsrb $3, 35(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_72: ## %else104
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_74
+; AVX2-NEXT: ## BB#73: ## %cond.load106
+; AVX2-NEXT: vpinsrb $4, 36(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_74: ## %else107
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_76
+; AVX2-NEXT: ## BB#75: ## %cond.load109
+; AVX2-NEXT: vpinsrb $5, 37(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_76: ## %else110
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_78
+; AVX2-NEXT: ## BB#77: ## %cond.load112
+; AVX2-NEXT: vpinsrb $6, 38(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_78: ## %else113
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_80
+; AVX2-NEXT: ## BB#79: ## %cond.load115
+; AVX2-NEXT: vpinsrb $7, 39(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_80: ## %else116
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_82
+; AVX2-NEXT: ## BB#81: ## %cond.load118
+; AVX2-NEXT: vpinsrb $8, 40(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_82: ## %else119
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_84
+; AVX2-NEXT: ## BB#83: ## %cond.load121
+; AVX2-NEXT: vpinsrb $9, 41(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_84: ## %else122
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_86
+; AVX2-NEXT: ## BB#85: ## %cond.load124
+; AVX2-NEXT: vpinsrb $10, 42(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_86: ## %else125
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_88
+; AVX2-NEXT: ## BB#87: ## %cond.load127
+; AVX2-NEXT: vpinsrb $11, 43(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_88: ## %else128
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_90
+; AVX2-NEXT: ## BB#89: ## %cond.load130
+; AVX2-NEXT: vpinsrb $12, 44(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_90: ## %else131
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_92
+; AVX2-NEXT: ## BB#91: ## %cond.load133
+; AVX2-NEXT: vpinsrb $13, 45(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_92: ## %else134
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_94
+; AVX2-NEXT: ## BB#93: ## %cond.load136
+; AVX2-NEXT: vpinsrb $14, 46(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_94: ## %else137
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_96
+; AVX2-NEXT: ## BB#95: ## %cond.load139
+; AVX2-NEXT: vpinsrb $15, 47(%rax), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB52_96: ## %else140
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_98
+; AVX2-NEXT: ## BB#97: ## %cond.load142
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $0, 48(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_98: ## %else143
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_100
+; AVX2-NEXT: ## BB#99: ## %cond.load145
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $1, 49(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_100: ## %else146
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_102
+; AVX2-NEXT: ## BB#101: ## %cond.load148
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $2, 50(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_102: ## %else149
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_104
+; AVX2-NEXT: ## BB#103: ## %cond.load151
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $3, 51(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_104: ## %else152
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_106
+; AVX2-NEXT: ## BB#105: ## %cond.load154
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $4, 52(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_106: ## %else155
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_108
+; AVX2-NEXT: ## BB#107: ## %cond.load157
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $5, 53(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_108: ## %else158
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_110
+; AVX2-NEXT: ## BB#109: ## %cond.load160
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $6, 54(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_110: ## %else161
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_112
+; AVX2-NEXT: ## BB#111: ## %cond.load163
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $7, 55(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_112: ## %else164
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_114
+; AVX2-NEXT: ## BB#113: ## %cond.load166
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $8, 56(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_114: ## %else167
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_116
+; AVX2-NEXT: ## BB#115: ## %cond.load169
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $9, 57(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_116: ## %else170
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_118
+; AVX2-NEXT: ## BB#117: ## %cond.load172
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $10, 58(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_118: ## %else173
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_120
+; AVX2-NEXT: ## BB#119: ## %cond.load175
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $11, 59(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_120: ## %else176
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_122
+; AVX2-NEXT: ## BB#121: ## %cond.load178
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $12, 60(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_122: ## %else179
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: je LBB52_124
+; AVX2-NEXT: ## BB#123: ## %cond.load181
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $13, 61(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_124: ## %else182
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: jne LBB52_126
+; AVX2-NEXT: ## BB#125:
+; AVX2-NEXT: movq %rax, %rdi
+; AVX2-NEXT: jmp LBB52_127
+; AVX2-NEXT: LBB52_126: ## %cond.load184
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: movq %rax, %rdi
+; AVX2-NEXT: vpinsrb $14, 62(%rax), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_127: ## %else185
+; AVX2-NEXT: movl %ebp, %eax
+; AVX2-NEXT: testb $1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %r9d, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movl %r8d, (%rsp) ## 4-byte Spill
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movl %esi, %ebp
+; AVX2-NEXT: je LBB52_129
+; AVX2-NEXT: ## BB#128: ## %cond.load187
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrb $15, 63(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB52_129: ## %else188
+; AVX2-NEXT: movzbl %r10b, %ecx
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %r11b, %ecx
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %bl, %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %r14b, %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %r12b, %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %r13b, %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl %r15b, %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; AVX2-NEXT: movl %edi, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %edi ## 4-byte Reload
+; AVX2-NEXT: movzbl %dil, %r13d
+; AVX2-NEXT: vmovd %r13d, %xmm4
+; AVX2-NEXT: movzbl %bpl, %ebp
+; AVX2-NEXT: vpinsrb $1, %ebp, %xmm4, %xmm4
+; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX2-NEXT: movzbl %bpl, %ebp
+; AVX2-NEXT: vpinsrb $2, %ebp, %xmm4, %xmm4
+; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX2-NEXT: movzbl %bpl, %ebp
+; AVX2-NEXT: vpinsrb $3, %ebp, %xmm4, %xmm4
+; AVX2-NEXT: movl (%rsp), %ebp ## 4-byte Reload
+; AVX2-NEXT: movzbl %bpl, %ebp
+; AVX2-NEXT: vpinsrb $4, %ebp, %xmm4, %xmm4
+; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX2-NEXT: movzbl %bpl, %ebp
+; AVX2-NEXT: vpinsrb $5, %ebp, %xmm4, %xmm4
+; AVX2-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm4, %xmm4 ## 4-byte Folded Reload
+; AVX2-NEXT: vmovd -{{[0-9]+}}(%rsp), %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: ## xmm5 = mem[0],zero,zero,zero
+; AVX2-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $2, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $3, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $4, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm5, %xmm5 ## 4-byte Folded Reload
+; AVX2-NEXT: vmovd %r12d, %xmm6
+; AVX2-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX2-NEXT: vpinsrb $2, %r15d, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $3, %r14d, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $4, %ebx, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $5, %r11d, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $6, %r9d, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $7, %esi, %xmm6, %xmm6
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX2-NEXT: vpinsrb $9, %ecx, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r9d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; AVX2-NEXT: vpinsrb $10, %edx, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d
+; AVX2-NEXT: vpinsrb $11, %r8d, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r13d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX2-NEXT: vpinsrb $12, %r10d, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
+; AVX2-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %edi
+; AVX2-NEXT: vpinsrb $15, %r15d, %xmm6, %xmm6
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %esi
+; AVX2-NEXT: vmovd %r12d, %xmm7
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d
+; AVX2-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
+; AVX2-NEXT: vpinsrb $1, %r9d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $2, %r11d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $3, %r14d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $4, %r13d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $5, %ecx, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $6, %r8d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $7, %ebx, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $8, %r10d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $9, %ebp, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $11, %edi, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $12, %r15d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $13, %esi, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $14, %r12d, %xmm7, %xmm7
+; AVX2-NEXT: vpinsrb $15, %edx, %xmm7, %xmm7
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
+; AVX2-NEXT: vpsllw $7, %ymm4, %ymm4
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm2
+; AVX2-NEXT: vpsllw $7, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: addq $8, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_load_64xi8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: Ltmp0:
+; AVX512F-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-NEXT: pushq %r15
+; AVX512F-NEXT: Ltmp1:
+; AVX512F-NEXT: .cfi_def_cfa_offset 24
+; AVX512F-NEXT: pushq %r14
+; AVX512F-NEXT: Ltmp2:
+; AVX512F-NEXT: .cfi_def_cfa_offset 32
+; AVX512F-NEXT: pushq %r13
+; AVX512F-NEXT: Ltmp3:
+; AVX512F-NEXT: .cfi_def_cfa_offset 40
+; AVX512F-NEXT: pushq %r12
+; AVX512F-NEXT: Ltmp4:
+; AVX512F-NEXT: .cfi_def_cfa_offset 48
+; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: Ltmp5:
+; AVX512F-NEXT: .cfi_def_cfa_offset 56
+; AVX512F-NEXT: subq $76, %rsp
+; AVX512F-NEXT: Ltmp6:
+; AVX512F-NEXT: .cfi_def_cfa_offset 132
+; AVX512F-NEXT: Ltmp7:
+; AVX512F-NEXT: .cfi_offset %rbx, -56
+; AVX512F-NEXT: Ltmp8:
+; AVX512F-NEXT: .cfi_offset %r12, -48
+; AVX512F-NEXT: Ltmp9:
+; AVX512F-NEXT: .cfi_offset %r13, -40
+; AVX512F-NEXT: Ltmp10:
+; AVX512F-NEXT: .cfi_offset %r14, -32
+; AVX512F-NEXT: Ltmp11:
+; AVX512F-NEXT: .cfi_offset %r15, -24
+; AVX512F-NEXT: Ltmp12:
+; AVX512F-NEXT: .cfi_offset %rbp, -16
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_2
+; AVX512F-NEXT: ## BB#1: ## %cond.load
+; AVX512F-NEXT: movzbl (%rdi), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: LBB52_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_4
+; AVX512F-NEXT: ## BB#3: ## %cond.load1
+; AVX512F-NEXT: vpinsrb $1, 1(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_6
+; AVX512F-NEXT: ## BB#5: ## %cond.load4
+; AVX512F-NEXT: vpinsrb $2, 2(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_6: ## %else5
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_8
+; AVX512F-NEXT: ## BB#7: ## %cond.load7
+; AVX512F-NEXT: vpinsrb $3, 3(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_8: ## %else8
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_10
+; AVX512F-NEXT: ## BB#9: ## %cond.load10
+; AVX512F-NEXT: vpinsrb $4, 4(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_10: ## %else11
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_12
+; AVX512F-NEXT: ## BB#11: ## %cond.load13
+; AVX512F-NEXT: vpinsrb $5, 5(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_12: ## %else14
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_14
+; AVX512F-NEXT: ## BB#13: ## %cond.load16
+; AVX512F-NEXT: vpinsrb $6, 6(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_14: ## %else17
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_16
+; AVX512F-NEXT: ## BB#15: ## %cond.load19
+; AVX512F-NEXT: vpinsrb $7, 7(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_16: ## %else20
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_18
+; AVX512F-NEXT: ## BB#17: ## %cond.load22
+; AVX512F-NEXT: vpinsrb $8, 8(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_18: ## %else23
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, (%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_20
+; AVX512F-NEXT: ## BB#19: ## %cond.load25
+; AVX512F-NEXT: vpinsrb $9, 9(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_20: ## %else26
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_22
+; AVX512F-NEXT: ## BB#21: ## %cond.load28
+; AVX512F-NEXT: vpinsrb $10, 10(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_22: ## %else29
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_24
+; AVX512F-NEXT: ## BB#23: ## %cond.load31
+; AVX512F-NEXT: vpinsrb $11, 11(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_24: ## %else32
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_26
+; AVX512F-NEXT: ## BB#25: ## %cond.load34
+; AVX512F-NEXT: vpinsrb $12, 12(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_26: ## %else35
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_28
+; AVX512F-NEXT: ## BB#27: ## %cond.load37
+; AVX512F-NEXT: vpinsrb $13, 13(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_28: ## %else38
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_30
+; AVX512F-NEXT: ## BB#29: ## %cond.load40
+; AVX512F-NEXT: vpinsrb $14, 14(%rdi), %xmm0, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_30: ## %else41
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
+; AVX512F-NEXT: kshiftlw $0, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_32
+; AVX512F-NEXT: ## BB#31: ## %cond.load43
+; AVX512F-NEXT: vpinsrb $15, 15(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_32: ## %else44
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_34
+; AVX512F-NEXT: ## BB#33: ## %cond.load46
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $0, 16(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_34: ## %else47
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_36
+; AVX512F-NEXT: ## BB#35: ## %cond.load49
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $1, 17(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_36: ## %else50
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_38
+; AVX512F-NEXT: ## BB#37: ## %cond.load52
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $2, 18(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_38: ## %else53
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_40
+; AVX512F-NEXT: ## BB#39: ## %cond.load55
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $3, 19(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_40: ## %else56
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_42
+; AVX512F-NEXT: ## BB#41: ## %cond.load58
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $4, 20(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_42: ## %else59
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_44
+; AVX512F-NEXT: ## BB#43: ## %cond.load61
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $5, 21(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_44: ## %else62
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_46
+; AVX512F-NEXT: ## BB#45: ## %cond.load64
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $6, 22(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_46: ## %else65
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_48
+; AVX512F-NEXT: ## BB#47: ## %cond.load67
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $7, 23(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_48: ## %else68
+; AVX512F-NEXT: kshiftlw $7, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_50
+; AVX512F-NEXT: ## BB#49: ## %cond.load70
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $8, 24(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_50: ## %else71
+; AVX512F-NEXT: kshiftlw $6, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_52
+; AVX512F-NEXT: ## BB#51: ## %cond.load73
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $9, 25(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_52: ## %else74
+; AVX512F-NEXT: kshiftlw $5, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_54
+; AVX512F-NEXT: ## BB#53: ## %cond.load76
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $10, 26(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_54: ## %else77
+; AVX512F-NEXT: kshiftlw $4, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_56
+; AVX512F-NEXT: ## BB#55: ## %cond.load79
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $11, 27(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_56: ## %else80
+; AVX512F-NEXT: kshiftlw $3, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_58
+; AVX512F-NEXT: ## BB#57: ## %cond.load82
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $12, 28(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_58: ## %else83
+; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm1
+; AVX512F-NEXT: kshiftlw $2, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_60
+; AVX512F-NEXT: ## BB#59: ## %cond.load85
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpinsrb $13, 29(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_60: ## %else86
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: kshiftlw $1, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_62
+; AVX512F-NEXT: ## BB#61: ## %cond.load88
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpinsrb $14, 30(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_62: ## %else89
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kshiftlw $0, %k1, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_64
+; AVX512F-NEXT: ## BB#63: ## %cond.load91
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrb $15, 31(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB52_64: ## %else92
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_66
+; AVX512F-NEXT: ## BB#65: ## %cond.load94
+; AVX512F-NEXT: vpinsrb $0, 32(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB52_66: ## %else95
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_68
+; AVX512F-NEXT: ## BB#67: ## %cond.load97
+; AVX512F-NEXT: vpinsrb $1, 33(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_68: ## %else98
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_70
+; AVX512F-NEXT: ## BB#69: ## %cond.load100
+; AVX512F-NEXT: vpinsrb $2, 34(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_70: ## %else101
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_72
+; AVX512F-NEXT: ## BB#71: ## %cond.load103
+; AVX512F-NEXT: vpinsrb $3, 35(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_72: ## %else104
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_74
+; AVX512F-NEXT: ## BB#73: ## %cond.load106
+; AVX512F-NEXT: vpinsrb $4, 36(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_74: ## %else107
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_76
+; AVX512F-NEXT: ## BB#75: ## %cond.load109
+; AVX512F-NEXT: vpinsrb $5, 37(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_76: ## %else110
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_78
+; AVX512F-NEXT: ## BB#77: ## %cond.load112
+; AVX512F-NEXT: vpinsrb $6, 38(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_78: ## %else113
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_80
+; AVX512F-NEXT: ## BB#79: ## %cond.load115
+; AVX512F-NEXT: vpinsrb $7, 39(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_80: ## %else116
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_82
+; AVX512F-NEXT: ## BB#81: ## %cond.load118
+; AVX512F-NEXT: vpinsrb $8, 40(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_82: ## %else119
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_84
+; AVX512F-NEXT: ## BB#83: ## %cond.load121
+; AVX512F-NEXT: vpinsrb $9, 41(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_84: ## %else122
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_86
+; AVX512F-NEXT: ## BB#85: ## %cond.load124
+; AVX512F-NEXT: vpinsrb $10, 42(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_86: ## %else125
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_88
+; AVX512F-NEXT: ## BB#87: ## %cond.load127
+; AVX512F-NEXT: vpinsrb $11, 43(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_88: ## %else128
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_90
+; AVX512F-NEXT: ## BB#89: ## %cond.load130
+; AVX512F-NEXT: vpinsrb $12, 44(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_90: ## %else131
+; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm2
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_92
+; AVX512F-NEXT: ## BB#91: ## %cond.load133
+; AVX512F-NEXT: vpinsrb $13, 45(%rdi), %xmm1, %xmm3
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_92: ## %else134
+; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_94
+; AVX512F-NEXT: ## BB#93: ## %cond.load136
+; AVX512F-NEXT: vpinsrb $14, 46(%rdi), %xmm1, %xmm3
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_94: ## %else137
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
+; AVX512F-NEXT: kshiftlw $0, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_96
+; AVX512F-NEXT: ## BB#95: ## %cond.load139
+; AVX512F-NEXT: vpinsrb $15, 47(%rdi), %xmm1, %xmm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: LBB52_96: ## %else140
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_98
+; AVX512F-NEXT: ## BB#97: ## %cond.load142
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $0, 48(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_98: ## %else143
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_100
+; AVX512F-NEXT: ## BB#99: ## %cond.load145
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $1, 49(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_100: ## %else146
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_102
+; AVX512F-NEXT: ## BB#101: ## %cond.load148
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $2, 50(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_102: ## %else149
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_104
+; AVX512F-NEXT: ## BB#103: ## %cond.load151
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $3, 51(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_104: ## %else152
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_106
+; AVX512F-NEXT: ## BB#105: ## %cond.load154
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $4, 52(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_106: ## %else155
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_108
+; AVX512F-NEXT: ## BB#107: ## %cond.load157
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $5, 53(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_108: ## %else158
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_110
+; AVX512F-NEXT: ## BB#109: ## %cond.load160
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $6, 54(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_110: ## %else161
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_112
+; AVX512F-NEXT: ## BB#111: ## %cond.load163
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $7, 55(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_112: ## %else164
+; AVX512F-NEXT: kshiftlw $7, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, -{{[0-9]+}}(%rsp) ## 2-byte Spill
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_114
+; AVX512F-NEXT: ## BB#113: ## %cond.load166
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $8, 56(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_114: ## %else167
+; AVX512F-NEXT: kshiftlw $6, %k1, %k2
+; AVX512F-NEXT: kshiftrw $15, %k2, %k2
+; AVX512F-NEXT: kmovw %k2, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_116
+; AVX512F-NEXT: ## BB#115: ## %cond.load169
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $9, 57(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_116: ## %else170
+; AVX512F-NEXT: kshiftlw $5, %k1, %k3
+; AVX512F-NEXT: kshiftrw $15, %k3, %k3
+; AVX512F-NEXT: kmovw %k3, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_118
+; AVX512F-NEXT: ## BB#117: ## %cond.load172
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $10, 58(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_118: ## %else173
+; AVX512F-NEXT: kshiftlw $4, %k1, %k4
+; AVX512F-NEXT: kshiftrw $15, %k4, %k4
+; AVX512F-NEXT: kmovw %k4, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_120
+; AVX512F-NEXT: ## BB#119: ## %cond.load175
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $11, 59(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_120: ## %else176
+; AVX512F-NEXT: kshiftlw $3, %k1, %k5
+; AVX512F-NEXT: kshiftrw $15, %k5, %k5
+; AVX512F-NEXT: kmovw %k5, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_122
+; AVX512F-NEXT: ## BB#121: ## %cond.load178
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $12, 60(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_122: ## %else179
+; AVX512F-NEXT: kshiftlw $2, %k1, %k6
+; AVX512F-NEXT: kshiftrw $15, %k6, %k6
+; AVX512F-NEXT: kmovw %k6, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_124
+; AVX512F-NEXT: ## BB#123: ## %cond.load181
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $13, 61(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_124: ## %else182
+; AVX512F-NEXT: kshiftlw $1, %k1, %k7
+; AVX512F-NEXT: kshiftrw $15, %k7, %k7
+; AVX512F-NEXT: kmovw %k7, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_126
+; AVX512F-NEXT: ## BB#125: ## %cond.load184
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $14, 62(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_126: ## %else185
+; AVX512F-NEXT: kshiftlw $0, %k1, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB52_128
+; AVX512F-NEXT: ## BB#127: ## %cond.load187
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpinsrb $15, 63(%rdi), %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: LBB52_128: ## %else188
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw (%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, (%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw %k2, %eax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw %k3, %r12d
+; AVX512F-NEXT: kmovw %k4, %r15d
+; AVX512F-NEXT: kmovw %k5, %r14d
+; AVX512F-NEXT: kmovw %k6, %ebx
+; AVX512F-NEXT: kmovw %k7, %r11d
+; AVX512F-NEXT: kmovw %k1, %r10d
+; AVX512F-NEXT: kmovw -{{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ## 4-byte Spill
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %r8d
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %r9d
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %edi
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %esi
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %edx
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movl -{{[0-9]+}}(%rsp), %r13d ## 4-byte Reload
+; AVX512F-NEXT: vmovd %r13d, %xmm2
+; AVX512F-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $2, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $3, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $4, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $9, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm2, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: movl {{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX512F-NEXT: vmovd %ebp, %xmm3
+; AVX512F-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $9, (%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $10, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $11, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $12, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $13, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $14, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $15, -{{[0-9]+}}(%rsp), %xmm3, %xmm3 ## 4-byte Folded Reload
+; AVX512F-NEXT: movl -{{[0-9]+}}(%rsp), %ebp ## 4-byte Reload
+; AVX512F-NEXT: vmovd %ebp, %xmm6
+; AVX512F-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $2, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $3, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $4, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $5, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $6, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $7, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $8, -{{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm6, %xmm6 ## 4-byte Folded Reload
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %r13d
+; AVX512F-NEXT: vpinsrb $10, %r12d, %xmm6, %xmm6
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %r12d
+; AVX512F-NEXT: vpinsrb $11, %r15d, %xmm6, %xmm6
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %r15d
+; AVX512F-NEXT: vpinsrb $12, %r14d, %xmm6, %xmm6
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %r14d
+; AVX512F-NEXT: vpinsrb $13, %ebx, %xmm6, %xmm6
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %ebx
+; AVX512F-NEXT: vpinsrb $14, %r11d, %xmm6, %xmm6
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %r11d
+; AVX512F-NEXT: vpinsrb $15, %r10d, %xmm6, %xmm6
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %r10d
+; AVX512F-NEXT: vmovd %r8d, %xmm7
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; AVX512F-NEXT: kmovw %k0, %r8d
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpinsrb $1, -{{[0-9]+}}(%rsp), %xmm7, %xmm2 ## 4-byte Folded Reload
+; AVX512F-NEXT: vpinsrb $2, %r9d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $3, %edi, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $4, %esi, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $8, %r13d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $9, %r12d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $10, %r15d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $11, %r14d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $12, %ebx, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $13, %r11d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $14, %r10d, %xmm2, %xmm2
+; AVX512F-NEXT: vpinsrb $15, %r8d, %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT: addq $76, %rsp
+; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: popq %r12
+; AVX512F-NEXT: popq %r13
+; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_load_64xi8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %zmm0, %zmm0
+; SKX-NEXT: vpmovb2m %zmm0, %k1
+; SKX-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
+ %res = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* %addr, i32 4, <64 x i1>%mask, <64 x i8> %val)
+ ret <64 x i8> %res
+}
+declare <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>)
+
+define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
+; AVX-LABEL: test_mask_load_8xi16:
+; AVX: ## BB#0:
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: ## implicit-def: %XMM1
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_2
+; AVX-NEXT: ## BB#1: ## %cond.load
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: LBB53_2: ## %else
+; AVX-NEXT: vpextrb $2, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_4
+; AVX-NEXT: ## BB#3: ## %cond.load1
+; AVX-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_4: ## %else2
+; AVX-NEXT: vpextrb $4, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_6
+; AVX-NEXT: ## BB#5: ## %cond.load4
+; AVX-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_6: ## %else5
+; AVX-NEXT: vpextrb $6, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_8
+; AVX-NEXT: ## BB#7: ## %cond.load7
+; AVX-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_8: ## %else8
+; AVX-NEXT: vpextrb $8, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_10
+; AVX-NEXT: ## BB#9: ## %cond.load10
+; AVX-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_10: ## %else11
+; AVX-NEXT: vpextrb $10, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_12
+; AVX-NEXT: ## BB#11: ## %cond.load13
+; AVX-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_12: ## %else14
+; AVX-NEXT: vpextrb $12, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_14
+; AVX-NEXT: ## BB#13: ## %cond.load16
+; AVX-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_14: ## %else17
+; AVX-NEXT: vpextrb $14, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB53_16
+; AVX-NEXT: ## BB#15: ## %cond.load19
+; AVX-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1
+; AVX-NEXT: LBB53_16: ## %else20
+; AVX-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_load_8xi16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: ## implicit-def: %XMM0
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_2
+; AVX512F-NEXT: ## BB#1: ## %cond.load
+; AVX512F-NEXT: movzwl (%rdi), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: LBB53_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_4
+; AVX512F-NEXT: ## BB#3: ## %cond.load1
+; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_6
+; AVX512F-NEXT: ## BB#5: ## %cond.load4
+; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_6: ## %else5
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_8
+; AVX512F-NEXT: ## BB#7: ## %cond.load7
+; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_8: ## %else8
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_10
+; AVX512F-NEXT: ## BB#9: ## %cond.load10
+; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_10: ## %else11
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_12
+; AVX512F-NEXT: ## BB#11: ## %cond.load13
+; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_12: ## %else14
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_14
+; AVX512F-NEXT: ## BB#13: ## %cond.load16
+; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_14: ## %else17
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB53_16
+; AVX512F-NEXT: ## BB#15: ## %cond.load19
+; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: LBB53_16: ## %else20
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
+; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_load_8xi16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT: retq
+ %res = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %addr, i32 4, <8 x i1>%mask, <8 x i16> undef)
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
+
+define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
+; AVX1-LABEL: test_mask_load_16xi16:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: ## implicit-def: %YMM1
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_2
+; AVX1-NEXT: ## BB#1: ## %cond.load
+; AVX1-NEXT: movzwl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: LBB54_2: ## %else
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_4
+; AVX1-NEXT: ## BB#3: ## %cond.load1
+; AVX1-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_4: ## %else2
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_6
+; AVX1-NEXT: ## BB#5: ## %cond.load4
+; AVX1-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_6: ## %else5
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_8
+; AVX1-NEXT: ## BB#7: ## %cond.load7
+; AVX1-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_8: ## %else8
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_10
+; AVX1-NEXT: ## BB#9: ## %cond.load10
+; AVX1-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_10: ## %else11
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_12
+; AVX1-NEXT: ## BB#11: ## %cond.load13
+; AVX1-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_12: ## %else14
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_14
+; AVX1-NEXT: ## BB#13: ## %cond.load16
+; AVX1-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_14: ## %else17
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_16
+; AVX1-NEXT: ## BB#15: ## %cond.load19
+; AVX1-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: LBB54_16: ## %else20
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_18
+; AVX1-NEXT: ## BB#17: ## %cond.load22
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $0, 16(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_18: ## %else23
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_20
+; AVX1-NEXT: ## BB#19: ## %cond.load25
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $1, 18(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_20: ## %else26
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_22
+; AVX1-NEXT: ## BB#21: ## %cond.load28
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $2, 20(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_22: ## %else29
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_24
+; AVX1-NEXT: ## BB#23: ## %cond.load31
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $3, 22(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_24: ## %else32
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_26
+; AVX1-NEXT: ## BB#25: ## %cond.load34
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $4, 24(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_26: ## %else35
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_28
+; AVX1-NEXT: ## BB#27: ## %cond.load37
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $5, 26(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_28: ## %else38
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_30
+; AVX1-NEXT: ## BB#29: ## %cond.load40
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $6, 28(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_30: ## %else41
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB54_32
+; AVX1-NEXT: ## BB#31: ## %cond.load43
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpinsrw $7, 30(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: LBB54_32: ## %else44
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpsllw $15, %xmm2, %xmm2
+; AVX1-NEXT: vpsraw $15, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_load_16xi16:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: ## implicit-def: %YMM1
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_2
+; AVX2-NEXT: ## BB#1: ## %cond.load
+; AVX2-NEXT: movzwl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: LBB54_2: ## %else
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_4
+; AVX2-NEXT: ## BB#3: ## %cond.load1
+; AVX2-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_4: ## %else2
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_6
+; AVX2-NEXT: ## BB#5: ## %cond.load4
+; AVX2-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_6: ## %else5
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_8
+; AVX2-NEXT: ## BB#7: ## %cond.load7
+; AVX2-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_8: ## %else8
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_10
+; AVX2-NEXT: ## BB#9: ## %cond.load10
+; AVX2-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_10: ## %else11
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_12
+; AVX2-NEXT: ## BB#11: ## %cond.load13
+; AVX2-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_12: ## %else14
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_14
+; AVX2-NEXT: ## BB#13: ## %cond.load16
+; AVX2-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_14: ## %else17
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_16
+; AVX2-NEXT: ## BB#15: ## %cond.load19
+; AVX2-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: LBB54_16: ## %else20
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_18
+; AVX2-NEXT: ## BB#17: ## %cond.load22
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $0, 16(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_18: ## %else23
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_20
+; AVX2-NEXT: ## BB#19: ## %cond.load25
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $1, 18(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_20: ## %else26
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_22
+; AVX2-NEXT: ## BB#21: ## %cond.load28
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $2, 20(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_22: ## %else29
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_24
+; AVX2-NEXT: ## BB#23: ## %cond.load31
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $3, 22(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_24: ## %else32
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_26
+; AVX2-NEXT: ## BB#25: ## %cond.load34
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $4, 24(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_26: ## %else35
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_28
+; AVX2-NEXT: ## BB#27: ## %cond.load37
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $5, 26(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_28: ## %else38
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_30
+; AVX2-NEXT: ## BB#29: ## %cond.load40
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $6, 28(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_30: ## %else41
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB54_32
+; AVX2-NEXT: ## BB#31: ## %cond.load43
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpinsrw $7, 30(%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: LBB54_32: ## %else44
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0
+; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_load_16xi16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: ## implicit-def: %YMM0
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_2
+; AVX512F-NEXT: ## BB#1: ## %cond.load
+; AVX512F-NEXT: movzwl (%rdi), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: LBB54_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_4
+; AVX512F-NEXT: ## BB#3: ## %cond.load1
+; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_6
+; AVX512F-NEXT: ## BB#5: ## %cond.load4
+; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_6: ## %else5
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_8
+; AVX512F-NEXT: ## BB#7: ## %cond.load7
+; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_8: ## %else8
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_10
+; AVX512F-NEXT: ## BB#9: ## %cond.load10
+; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_10: ## %else11
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_12
+; AVX512F-NEXT: ## BB#11: ## %cond.load13
+; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_12: ## %else14
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_14
+; AVX512F-NEXT: ## BB#13: ## %cond.load16
+; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_14: ## %else17
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_16
+; AVX512F-NEXT: ## BB#15: ## %cond.load19
+; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB54_16: ## %else20
+; AVX512F-NEXT: kshiftlw $7, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_18
+; AVX512F-NEXT: ## BB#17: ## %cond.load22
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $0, 16(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_18: ## %else23
+; AVX512F-NEXT: kshiftlw $6, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_20
+; AVX512F-NEXT: ## BB#19: ## %cond.load25
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $1, 18(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_20: ## %else26
+; AVX512F-NEXT: kshiftlw $5, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_22
+; AVX512F-NEXT: ## BB#21: ## %cond.load28
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $2, 20(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_22: ## %else29
+; AVX512F-NEXT: kshiftlw $4, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_24
+; AVX512F-NEXT: ## BB#23: ## %cond.load31
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $3, 22(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_24: ## %else32
+; AVX512F-NEXT: kshiftlw $3, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_26
+; AVX512F-NEXT: ## BB#25: ## %cond.load34
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $4, 24(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_26: ## %else35
+; AVX512F-NEXT: kshiftlw $2, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_28
+; AVX512F-NEXT: ## BB#27: ## %cond.load37
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $5, 26(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_28: ## %else38
+; AVX512F-NEXT: kshiftlw $1, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_30
+; AVX512F-NEXT: ## BB#29: ## %cond.load40
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $6, 28(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_30: ## %else41
+; AVX512F-NEXT: kshiftlw $0, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB54_32
+; AVX512F-NEXT: ## BB#31: ## %cond.load43
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpinsrw $7, 30(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: LBB54_32: ## %else44
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_load_16xi16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT: retq
+ %res = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* %addr, i32 4, <16 x i1>%mask, <16 x i16> zeroinitializer)
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>)
+
+define <32 x i16> @test_mask_load_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i16> %val) {
+; AVX1-LABEL: test_mask_load_32xi16:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_2
+; AVX1-NEXT: ## BB#1: ## %cond.load
+; AVX1-NEXT: movzwl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: LBB55_2: ## %else
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_4
+; AVX1-NEXT: ## BB#3: ## %cond.load1
+; AVX1-NEXT: vpinsrw $1, 2(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_4: ## %else2
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_6
+; AVX1-NEXT: ## BB#5: ## %cond.load4
+; AVX1-NEXT: vpinsrw $2, 4(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_6: ## %else5
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_8
+; AVX1-NEXT: ## BB#7: ## %cond.load7
+; AVX1-NEXT: vpinsrw $3, 6(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_8: ## %else8
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_10
+; AVX1-NEXT: ## BB#9: ## %cond.load10
+; AVX1-NEXT: vpinsrw $4, 8(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_10: ## %else11
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_12
+; AVX1-NEXT: ## BB#11: ## %cond.load13
+; AVX1-NEXT: vpinsrw $5, 10(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_12: ## %else14
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_14
+; AVX1-NEXT: ## BB#13: ## %cond.load16
+; AVX1-NEXT: vpinsrw $6, 12(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_14: ## %else17
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_16
+; AVX1-NEXT: ## BB#15: ## %cond.load19
+; AVX1-NEXT: vpinsrw $7, 14(%rdi), %xmm3, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: LBB55_16: ## %else20
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_18
+; AVX1-NEXT: ## BB#17: ## %cond.load22
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $0, 16(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_18: ## %else23
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_20
+; AVX1-NEXT: ## BB#19: ## %cond.load25
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $1, 18(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_20: ## %else26
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_22
+; AVX1-NEXT: ## BB#21: ## %cond.load28
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $2, 20(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_22: ## %else29
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_24
+; AVX1-NEXT: ## BB#23: ## %cond.load31
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $3, 22(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_24: ## %else32
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_26
+; AVX1-NEXT: ## BB#25: ## %cond.load34
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $4, 24(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_26: ## %else35
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_28
+; AVX1-NEXT: ## BB#27: ## %cond.load37
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $5, 26(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_28: ## %else38
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_30
+; AVX1-NEXT: ## BB#29: ## %cond.load40
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $6, 28(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_30: ## %else41
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_32
+; AVX1-NEXT: ## BB#31: ## %cond.load43
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpinsrw $7, 30(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: LBB55_32: ## %else44
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpextrb $0, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_34
+; AVX1-NEXT: ## BB#33: ## %cond.load46
+; AVX1-NEXT: vpinsrw $0, 32(%rdi), %xmm0, %xmm5
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: LBB55_34: ## %else47
+; AVX1-NEXT: vpextrb $1, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_36
+; AVX1-NEXT: ## BB#35: ## %cond.load49
+; AVX1-NEXT: vpinsrw $1, 34(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_36: ## %else50
+; AVX1-NEXT: vpextrb $2, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_38
+; AVX1-NEXT: ## BB#37: ## %cond.load52
+; AVX1-NEXT: vpinsrw $2, 36(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_38: ## %else53
+; AVX1-NEXT: vpextrb $3, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_40
+; AVX1-NEXT: ## BB#39: ## %cond.load55
+; AVX1-NEXT: vpinsrw $3, 38(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_40: ## %else56
+; AVX1-NEXT: vpextrb $4, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_42
+; AVX1-NEXT: ## BB#41: ## %cond.load58
+; AVX1-NEXT: vpinsrw $4, 40(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_42: ## %else59
+; AVX1-NEXT: vpextrb $5, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_44
+; AVX1-NEXT: ## BB#43: ## %cond.load61
+; AVX1-NEXT: vpinsrw $5, 42(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_44: ## %else62
+; AVX1-NEXT: vpextrb $6, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_46
+; AVX1-NEXT: ## BB#45: ## %cond.load64
+; AVX1-NEXT: vpinsrw $6, 44(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_46: ## %else65
+; AVX1-NEXT: vpextrb $7, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_48
+; AVX1-NEXT: ## BB#47: ## %cond.load67
+; AVX1-NEXT: vpinsrw $7, 46(%rdi), %xmm5, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX1-NEXT: LBB55_48: ## %else68
+; AVX1-NEXT: vpextrb $8, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_50
+; AVX1-NEXT: ## BB#49: ## %cond.load70
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $0, 48(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_50: ## %else71
+; AVX1-NEXT: vpextrb $9, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_52
+; AVX1-NEXT: ## BB#51: ## %cond.load73
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $1, 50(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_52: ## %else74
+; AVX1-NEXT: vpextrb $10, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_54
+; AVX1-NEXT: ## BB#53: ## %cond.load76
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $2, 52(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_54: ## %else77
+; AVX1-NEXT: vpextrb $11, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_56
+; AVX1-NEXT: ## BB#55: ## %cond.load79
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $3, 54(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_56: ## %else80
+; AVX1-NEXT: vpextrb $12, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_58
+; AVX1-NEXT: ## BB#57: ## %cond.load82
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $4, 56(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_58: ## %else83
+; AVX1-NEXT: vpextrb $13, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_60
+; AVX1-NEXT: ## BB#59: ## %cond.load85
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $5, 58(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_60: ## %else86
+; AVX1-NEXT: vpextrb $14, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_62
+; AVX1-NEXT: ## BB#61: ## %cond.load88
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $6, 60(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_62: ## %else89
+; AVX1-NEXT: vpextrb $15, %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB55_64
+; AVX1-NEXT: ## BB#63: ## %cond.load91
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpinsrw $7, 62(%rdi), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: LBB55_64: ## %else92
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpsllw $15, %xmm6, %xmm6
+; AVX1-NEXT: vpsraw $15, %xmm6, %xmm6
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
+; AVX1-NEXT: vandnps %ymm1, %ymm0, %ymm1
+; AVX1-NEXT: vandps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-NEXT: vpsllw $15, %xmm1, %xmm1
+; AVX1-NEXT: vpsraw $15, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsllw $15, %xmm3, %xmm3
+; AVX1-NEXT: vpsraw $15, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vandnps %ymm2, %ymm1, %ymm2
+; AVX1-NEXT: vandps %ymm1, %ymm5, %ymm1
+; AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_load_32xi16:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_2
+; AVX2-NEXT: ## BB#1: ## %cond.load
+; AVX2-NEXT: movzwl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: LBB55_2: ## %else
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_4
+; AVX2-NEXT: ## BB#3: ## %cond.load1
+; AVX2-NEXT: vpinsrw $1, 2(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_4: ## %else2
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_6
+; AVX2-NEXT: ## BB#5: ## %cond.load4
+; AVX2-NEXT: vpinsrw $2, 4(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_6: ## %else5
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_8
+; AVX2-NEXT: ## BB#7: ## %cond.load7
+; AVX2-NEXT: vpinsrw $3, 6(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_8: ## %else8
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_10
+; AVX2-NEXT: ## BB#9: ## %cond.load10
+; AVX2-NEXT: vpinsrw $4, 8(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_10: ## %else11
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_12
+; AVX2-NEXT: ## BB#11: ## %cond.load13
+; AVX2-NEXT: vpinsrw $5, 10(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_12: ## %else14
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_14
+; AVX2-NEXT: ## BB#13: ## %cond.load16
+; AVX2-NEXT: vpinsrw $6, 12(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_14: ## %else17
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_16
+; AVX2-NEXT: ## BB#15: ## %cond.load19
+; AVX2-NEXT: vpinsrw $7, 14(%rdi), %xmm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: LBB55_16: ## %else20
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_18
+; AVX2-NEXT: ## BB#17: ## %cond.load22
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $0, 16(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_18: ## %else23
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_20
+; AVX2-NEXT: ## BB#19: ## %cond.load25
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $1, 18(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_20: ## %else26
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_22
+; AVX2-NEXT: ## BB#21: ## %cond.load28
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $2, 20(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_22: ## %else29
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_24
+; AVX2-NEXT: ## BB#23: ## %cond.load31
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $3, 22(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_24: ## %else32
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_26
+; AVX2-NEXT: ## BB#25: ## %cond.load34
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $4, 24(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_26: ## %else35
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_28
+; AVX2-NEXT: ## BB#27: ## %cond.load37
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $5, 26(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_28: ## %else38
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_30
+; AVX2-NEXT: ## BB#29: ## %cond.load40
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $6, 28(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_30: ## %else41
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_32
+; AVX2-NEXT: ## BB#31: ## %cond.load43
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpinsrw $7, 30(%rdi), %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: LBB55_32: ## %else44
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vpextrb $0, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_34
+; AVX2-NEXT: ## BB#33: ## %cond.load46
+; AVX2-NEXT: vpinsrw $0, 32(%rdi), %xmm0, %xmm5
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: LBB55_34: ## %else47
+; AVX2-NEXT: vpextrb $1, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_36
+; AVX2-NEXT: ## BB#35: ## %cond.load49
+; AVX2-NEXT: vpinsrw $1, 34(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_36: ## %else50
+; AVX2-NEXT: vpextrb $2, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_38
+; AVX2-NEXT: ## BB#37: ## %cond.load52
+; AVX2-NEXT: vpinsrw $2, 36(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_38: ## %else53
+; AVX2-NEXT: vpextrb $3, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_40
+; AVX2-NEXT: ## BB#39: ## %cond.load55
+; AVX2-NEXT: vpinsrw $3, 38(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_40: ## %else56
+; AVX2-NEXT: vpextrb $4, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_42
+; AVX2-NEXT: ## BB#41: ## %cond.load58
+; AVX2-NEXT: vpinsrw $4, 40(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_42: ## %else59
+; AVX2-NEXT: vpextrb $5, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_44
+; AVX2-NEXT: ## BB#43: ## %cond.load61
+; AVX2-NEXT: vpinsrw $5, 42(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_44: ## %else62
+; AVX2-NEXT: vpextrb $6, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_46
+; AVX2-NEXT: ## BB#45: ## %cond.load64
+; AVX2-NEXT: vpinsrw $6, 44(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_46: ## %else65
+; AVX2-NEXT: vpextrb $7, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_48
+; AVX2-NEXT: ## BB#47: ## %cond.load67
+; AVX2-NEXT: vpinsrw $7, 46(%rdi), %xmm5, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: LBB55_48: ## %else68
+; AVX2-NEXT: vpextrb $8, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_50
+; AVX2-NEXT: ## BB#49: ## %cond.load70
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $0, 48(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_50: ## %else71
+; AVX2-NEXT: vpextrb $9, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_52
+; AVX2-NEXT: ## BB#51: ## %cond.load73
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $1, 50(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_52: ## %else74
+; AVX2-NEXT: vpextrb $10, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_54
+; AVX2-NEXT: ## BB#53: ## %cond.load76
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $2, 52(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_54: ## %else77
+; AVX2-NEXT: vpextrb $11, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_56
+; AVX2-NEXT: ## BB#55: ## %cond.load79
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $3, 54(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_56: ## %else80
+; AVX2-NEXT: vpextrb $12, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_58
+; AVX2-NEXT: ## BB#57: ## %cond.load82
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $4, 56(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_58: ## %else83
+; AVX2-NEXT: vpextrb $13, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_60
+; AVX2-NEXT: ## BB#59: ## %cond.load85
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $5, 58(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_60: ## %else86
+; AVX2-NEXT: vpextrb $14, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_62
+; AVX2-NEXT: ## BB#61: ## %cond.load88
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $6, 60(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_62: ## %else89
+; AVX2-NEXT: vpextrb $15, %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB55_64
+; AVX2-NEXT: ## BB#63: ## %cond.load91
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpinsrw $7, 62(%rdi), %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: LBB55_64: ## %else92
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0
+; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX2-NEXT: vpsllw $15, %ymm1, %ymm1
+; AVX2-NEXT: vpsraw $15, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm1, %ymm5, %ymm2, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_load_32xi16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_2
+; AVX512F-NEXT: ## BB#1: ## %cond.load
+; AVX512F-NEXT: movzwl (%rdi), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm3
+; AVX512F-NEXT: LBB55_2: ## %else
+; AVX512F-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_4
+; AVX512F-NEXT: ## BB#3: ## %cond.load1
+; AVX512F-NEXT: vpinsrw $1, 2(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_4: ## %else2
+; AVX512F-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_6
+; AVX512F-NEXT: ## BB#5: ## %cond.load4
+; AVX512F-NEXT: vpinsrw $2, 4(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_6: ## %else5
+; AVX512F-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_8
+; AVX512F-NEXT: ## BB#7: ## %cond.load7
+; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_8: ## %else8
+; AVX512F-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_10
+; AVX512F-NEXT: ## BB#9: ## %cond.load10
+; AVX512F-NEXT: vpinsrw $4, 8(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_10: ## %else11
+; AVX512F-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_12
+; AVX512F-NEXT: ## BB#11: ## %cond.load13
+; AVX512F-NEXT: vpinsrw $5, 10(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_12: ## %else14
+; AVX512F-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_14
+; AVX512F-NEXT: ## BB#13: ## %cond.load16
+; AVX512F-NEXT: vpinsrw $6, 12(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_14: ## %else17
+; AVX512F-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_16
+; AVX512F-NEXT: ## BB#15: ## %cond.load19
+; AVX512F-NEXT: vpinsrw $7, 14(%rdi), %xmm3, %xmm4
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: LBB55_16: ## %else20
+; AVX512F-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_18
+; AVX512F-NEXT: ## BB#17: ## %cond.load22
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $0, 16(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_18: ## %else23
+; AVX512F-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_20
+; AVX512F-NEXT: ## BB#19: ## %cond.load25
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $1, 18(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_20: ## %else26
+; AVX512F-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_22
+; AVX512F-NEXT: ## BB#21: ## %cond.load28
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $2, 20(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_22: ## %else29
+; AVX512F-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_24
+; AVX512F-NEXT: ## BB#23: ## %cond.load31
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $3, 22(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_24: ## %else32
+; AVX512F-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_26
+; AVX512F-NEXT: ## BB#25: ## %cond.load34
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $4, 24(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_26: ## %else35
+; AVX512F-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_28
+; AVX512F-NEXT: ## BB#27: ## %cond.load37
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $5, 26(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_28: ## %else38
+; AVX512F-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_30
+; AVX512F-NEXT: ## BB#29: ## %cond.load40
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $6, 28(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_30: ## %else41
+; AVX512F-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_32
+; AVX512F-NEXT: ## BB#31: ## %cond.load43
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512F-NEXT: vpinsrw $7, 30(%rdi), %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: LBB55_32: ## %else44
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512F-NEXT: vpextrb $0, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_34
+; AVX512F-NEXT: ## BB#33: ## %cond.load46
+; AVX512F-NEXT: vpinsrw $0, 32(%rdi), %xmm0, %xmm5
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: LBB55_34: ## %else47
+; AVX512F-NEXT: vpextrb $1, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_36
+; AVX512F-NEXT: ## BB#35: ## %cond.load49
+; AVX512F-NEXT: vpinsrw $1, 34(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_36: ## %else50
+; AVX512F-NEXT: vpextrb $2, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_38
+; AVX512F-NEXT: ## BB#37: ## %cond.load52
+; AVX512F-NEXT: vpinsrw $2, 36(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_38: ## %else53
+; AVX512F-NEXT: vpextrb $3, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_40
+; AVX512F-NEXT: ## BB#39: ## %cond.load55
+; AVX512F-NEXT: vpinsrw $3, 38(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_40: ## %else56
+; AVX512F-NEXT: vpextrb $4, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_42
+; AVX512F-NEXT: ## BB#41: ## %cond.load58
+; AVX512F-NEXT: vpinsrw $4, 40(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_42: ## %else59
+; AVX512F-NEXT: vpextrb $5, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_44
+; AVX512F-NEXT: ## BB#43: ## %cond.load61
+; AVX512F-NEXT: vpinsrw $5, 42(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_44: ## %else62
+; AVX512F-NEXT: vpextrb $6, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_46
+; AVX512F-NEXT: ## BB#45: ## %cond.load64
+; AVX512F-NEXT: vpinsrw $6, 44(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_46: ## %else65
+; AVX512F-NEXT: vpextrb $7, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_48
+; AVX512F-NEXT: ## BB#47: ## %cond.load67
+; AVX512F-NEXT: vpinsrw $7, 46(%rdi), %xmm5, %xmm6
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512F-NEXT: LBB55_48: ## %else68
+; AVX512F-NEXT: vpextrb $8, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_50
+; AVX512F-NEXT: ## BB#49: ## %cond.load70
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $0, 48(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_50: ## %else71
+; AVX512F-NEXT: vpextrb $9, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_52
+; AVX512F-NEXT: ## BB#51: ## %cond.load73
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $1, 50(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_52: ## %else74
+; AVX512F-NEXT: vpextrb $10, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_54
+; AVX512F-NEXT: ## BB#53: ## %cond.load76
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $2, 52(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_54: ## %else77
+; AVX512F-NEXT: vpextrb $11, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_56
+; AVX512F-NEXT: ## BB#55: ## %cond.load79
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $3, 54(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_56: ## %else80
+; AVX512F-NEXT: vpextrb $12, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_58
+; AVX512F-NEXT: ## BB#57: ## %cond.load82
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $4, 56(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_58: ## %else83
+; AVX512F-NEXT: vpextrb $13, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_60
+; AVX512F-NEXT: ## BB#59: ## %cond.load85
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $5, 58(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_60: ## %else86
+; AVX512F-NEXT: vpextrb $14, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_62
+; AVX512F-NEXT: ## BB#61: ## %cond.load88
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $6, 60(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_62: ## %else89
+; AVX512F-NEXT: vpextrb $15, %xmm4, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB55_64
+; AVX512F-NEXT: ## BB#63: ## %cond.load91
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpinsrw $7, 62(%rdi), %xmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX512F-NEXT: LBB55_64: ## %else92
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512F-NEXT: vpsllw $15, %ymm0, %ymm0
+; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512F-NEXT: vpsllw $15, %ymm1, %ymm1
+; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm5, %ymm2, %ymm1
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_load_32xi16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
+; SKX-NEXT: vpmovb2m %ymm0, %k1
+; SKX-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1}
+; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: retq
+ %res = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* %addr, i32 4, <32 x i1>%mask, <32 x i16> %val)
+ ret <32 x i16> %res
+}
+declare <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>*, i32, <32 x i1>, <32 x i16>)
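 ; Aside: the checks above show the two lowering strategies for this masked load.
 ; Without AVX-512BW each mask lane is extracted (vpextrb), tested, and guarded by a
 ; branch around a scalar vpinsrw load; on SKX the whole operation folds into one
 ; masked vmovdqu16. A minimal sketch of the IR side, reusing the declare above, with a
 ; hypothetical function name and a constant even-lanes mask (alignment 4 as in the test):
 ;
 ;   define <32 x i16> @masked_load_sketch(<32 x i16>* %p, <32 x i16> %passthru) {
 ;     %r = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* %p, i32 4,
 ;            <32 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>,
 ;            <32 x i16> %passthru)
 ;     ret <32 x i16> %r
 ;   }
 ;
 ; Disabled lanes return the corresponding element of %passthru instead of touching memory.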
+
+define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
+; AVX-LABEL: test_mask_store_16xi8:
+; AVX: ## BB#0:
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_2
+; AVX-NEXT: ## BB#1: ## %cond.store
+; AVX-NEXT: vpextrb $0, %xmm1, (%rdi)
+; AVX-NEXT: LBB56_2: ## %else
+; AVX-NEXT: vpextrb $1, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_4
+; AVX-NEXT: ## BB#3: ## %cond.store1
+; AVX-NEXT: vpextrb $1, %xmm1, 1(%rdi)
+; AVX-NEXT: LBB56_4: ## %else2
+; AVX-NEXT: vpextrb $2, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_6
+; AVX-NEXT: ## BB#5: ## %cond.store3
+; AVX-NEXT: vpextrb $2, %xmm1, 2(%rdi)
+; AVX-NEXT: LBB56_6: ## %else4
+; AVX-NEXT: vpextrb $3, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_8
+; AVX-NEXT: ## BB#7: ## %cond.store5
+; AVX-NEXT: vpextrb $3, %xmm1, 3(%rdi)
+; AVX-NEXT: LBB56_8: ## %else6
+; AVX-NEXT: vpextrb $4, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_10
+; AVX-NEXT: ## BB#9: ## %cond.store7
+; AVX-NEXT: vpextrb $4, %xmm1, 4(%rdi)
+; AVX-NEXT: LBB56_10: ## %else8
+; AVX-NEXT: vpextrb $5, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_12
+; AVX-NEXT: ## BB#11: ## %cond.store9
+; AVX-NEXT: vpextrb $5, %xmm1, 5(%rdi)
+; AVX-NEXT: LBB56_12: ## %else10
+; AVX-NEXT: vpextrb $6, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_14
+; AVX-NEXT: ## BB#13: ## %cond.store11
+; AVX-NEXT: vpextrb $6, %xmm1, 6(%rdi)
+; AVX-NEXT: LBB56_14: ## %else12
+; AVX-NEXT: vpextrb $7, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_16
+; AVX-NEXT: ## BB#15: ## %cond.store13
+; AVX-NEXT: vpextrb $7, %xmm1, 7(%rdi)
+; AVX-NEXT: LBB56_16: ## %else14
+; AVX-NEXT: vpextrb $8, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_18
+; AVX-NEXT: ## BB#17: ## %cond.store15
+; AVX-NEXT: vpextrb $8, %xmm1, 8(%rdi)
+; AVX-NEXT: LBB56_18: ## %else16
+; AVX-NEXT: vpextrb $9, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_20
+; AVX-NEXT: ## BB#19: ## %cond.store17
+; AVX-NEXT: vpextrb $9, %xmm1, 9(%rdi)
+; AVX-NEXT: LBB56_20: ## %else18
+; AVX-NEXT: vpextrb $10, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_22
+; AVX-NEXT: ## BB#21: ## %cond.store19
+; AVX-NEXT: vpextrb $10, %xmm1, 10(%rdi)
+; AVX-NEXT: LBB56_22: ## %else20
+; AVX-NEXT: vpextrb $11, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_24
+; AVX-NEXT: ## BB#23: ## %cond.store21
+; AVX-NEXT: vpextrb $11, %xmm1, 11(%rdi)
+; AVX-NEXT: LBB56_24: ## %else22
+; AVX-NEXT: vpextrb $12, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_26
+; AVX-NEXT: ## BB#25: ## %cond.store23
+; AVX-NEXT: vpextrb $12, %xmm1, 12(%rdi)
+; AVX-NEXT: LBB56_26: ## %else24
+; AVX-NEXT: vpextrb $13, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_28
+; AVX-NEXT: ## BB#27: ## %cond.store25
+; AVX-NEXT: vpextrb $13, %xmm1, 13(%rdi)
+; AVX-NEXT: LBB56_28: ## %else26
+; AVX-NEXT: vpextrb $14, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_30
+; AVX-NEXT: ## BB#29: ## %cond.store27
+; AVX-NEXT: vpextrb $14, %xmm1, 14(%rdi)
+; AVX-NEXT: LBB56_30: ## %else28
+; AVX-NEXT: vpextrb $15, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB56_32
+; AVX-NEXT: ## BB#31: ## %cond.store29
+; AVX-NEXT: vpextrb $15, %xmm1, 15(%rdi)
+; AVX-NEXT: LBB56_32: ## %else30
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_store_16xi8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_2
+; AVX512F-NEXT: ## BB#1: ## %cond.store
+; AVX512F-NEXT: vpextrb $0, %xmm1, (%rdi)
+; AVX512F-NEXT: LBB56_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_4
+; AVX512F-NEXT: ## BB#3: ## %cond.store1
+; AVX512F-NEXT: vpextrb $1, %xmm1, 1(%rdi)
+; AVX512F-NEXT: LBB56_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_6
+; AVX512F-NEXT: ## BB#5: ## %cond.store3
+; AVX512F-NEXT: vpextrb $2, %xmm1, 2(%rdi)
+; AVX512F-NEXT: LBB56_6: ## %else4
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_8
+; AVX512F-NEXT: ## BB#7: ## %cond.store5
+; AVX512F-NEXT: vpextrb $3, %xmm1, 3(%rdi)
+; AVX512F-NEXT: LBB56_8: ## %else6
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_10
+; AVX512F-NEXT: ## BB#9: ## %cond.store7
+; AVX512F-NEXT: vpextrb $4, %xmm1, 4(%rdi)
+; AVX512F-NEXT: LBB56_10: ## %else8
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_12
+; AVX512F-NEXT: ## BB#11: ## %cond.store9
+; AVX512F-NEXT: vpextrb $5, %xmm1, 5(%rdi)
+; AVX512F-NEXT: LBB56_12: ## %else10
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_14
+; AVX512F-NEXT: ## BB#13: ## %cond.store11
+; AVX512F-NEXT: vpextrb $6, %xmm1, 6(%rdi)
+; AVX512F-NEXT: LBB56_14: ## %else12
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_16
+; AVX512F-NEXT: ## BB#15: ## %cond.store13
+; AVX512F-NEXT: vpextrb $7, %xmm1, 7(%rdi)
+; AVX512F-NEXT: LBB56_16: ## %else14
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_18
+; AVX512F-NEXT: ## BB#17: ## %cond.store15
+; AVX512F-NEXT: vpextrb $8, %xmm1, 8(%rdi)
+; AVX512F-NEXT: LBB56_18: ## %else16
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_20
+; AVX512F-NEXT: ## BB#19: ## %cond.store17
+; AVX512F-NEXT: vpextrb $9, %xmm1, 9(%rdi)
+; AVX512F-NEXT: LBB56_20: ## %else18
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_22
+; AVX512F-NEXT: ## BB#21: ## %cond.store19
+; AVX512F-NEXT: vpextrb $10, %xmm1, 10(%rdi)
+; AVX512F-NEXT: LBB56_22: ## %else20
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_24
+; AVX512F-NEXT: ## BB#23: ## %cond.store21
+; AVX512F-NEXT: vpextrb $11, %xmm1, 11(%rdi)
+; AVX512F-NEXT: LBB56_24: ## %else22
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_26
+; AVX512F-NEXT: ## BB#25: ## %cond.store23
+; AVX512F-NEXT: vpextrb $12, %xmm1, 12(%rdi)
+; AVX512F-NEXT: LBB56_26: ## %else24
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_28
+; AVX512F-NEXT: ## BB#27: ## %cond.store25
+; AVX512F-NEXT: vpextrb $13, %xmm1, 13(%rdi)
+; AVX512F-NEXT: LBB56_28: ## %else26
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_30
+; AVX512F-NEXT: ## BB#29: ## %cond.store27
+; AVX512F-NEXT: vpextrb $14, %xmm1, 14(%rdi)
+; AVX512F-NEXT: LBB56_30: ## %else28
+; AVX512F-NEXT: kshiftlw $0, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB56_32
+; AVX512F-NEXT: ## BB#31: ## %cond.store29
+; AVX512F-NEXT: vpextrb $15, %xmm1, 15(%rdi)
+; AVX512F-NEXT: LBB56_32: ## %else30
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_store_16xi8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: vmovdqu8 %xmm1, (%rdi) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %val, <16 x i8>* %addr, i32 4, <16 x i1>%mask)
+ ret void
+}
+declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
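 ; Aside: the store direction follows the same pattern as the load above — the AVX and
 ; AVX512F paths scalarize to a per-lane mask test plus a guarded vpextrb to memory, while
 ; SKX emits a single vmovdqu8 with a %k mask. A minimal IR sketch, reusing the declare
 ; above, with a hypothetical function name and a constant low-half mask:
 ;
 ;   define void @masked_store_sketch(<16 x i8>* %p, <16 x i8> %v) {
 ;     call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %v, <16 x i8>* %p, i32 4,
 ;           <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>)
 ;     ret void
 ;   }
 ;
 ; Only the first eight bytes are written; the remaining bytes of memory are left untouched.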
+
+define void @test_mask_store_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
+; AVX1-LABEL: test_mask_store_32xi8:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_2
+; AVX1-NEXT: ## BB#1: ## %cond.store
+; AVX1-NEXT: vpextrb $0, %xmm1, (%rdi)
+; AVX1-NEXT: LBB57_2: ## %else
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_4
+; AVX1-NEXT: ## BB#3: ## %cond.store1
+; AVX1-NEXT: vpextrb $1, %xmm1, 1(%rdi)
+; AVX1-NEXT: LBB57_4: ## %else2
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_6
+; AVX1-NEXT: ## BB#5: ## %cond.store3
+; AVX1-NEXT: vpextrb $2, %xmm1, 2(%rdi)
+; AVX1-NEXT: LBB57_6: ## %else4
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_8
+; AVX1-NEXT: ## BB#7: ## %cond.store5
+; AVX1-NEXT: vpextrb $3, %xmm1, 3(%rdi)
+; AVX1-NEXT: LBB57_8: ## %else6
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_10
+; AVX1-NEXT: ## BB#9: ## %cond.store7
+; AVX1-NEXT: vpextrb $4, %xmm1, 4(%rdi)
+; AVX1-NEXT: LBB57_10: ## %else8
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_12
+; AVX1-NEXT: ## BB#11: ## %cond.store9
+; AVX1-NEXT: vpextrb $5, %xmm1, 5(%rdi)
+; AVX1-NEXT: LBB57_12: ## %else10
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_14
+; AVX1-NEXT: ## BB#13: ## %cond.store11
+; AVX1-NEXT: vpextrb $6, %xmm1, 6(%rdi)
+; AVX1-NEXT: LBB57_14: ## %else12
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_16
+; AVX1-NEXT: ## BB#15: ## %cond.store13
+; AVX1-NEXT: vpextrb $7, %xmm1, 7(%rdi)
+; AVX1-NEXT: LBB57_16: ## %else14
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_18
+; AVX1-NEXT: ## BB#17: ## %cond.store15
+; AVX1-NEXT: vpextrb $8, %xmm1, 8(%rdi)
+; AVX1-NEXT: LBB57_18: ## %else16
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_20
+; AVX1-NEXT: ## BB#19: ## %cond.store17
+; AVX1-NEXT: vpextrb $9, %xmm1, 9(%rdi)
+; AVX1-NEXT: LBB57_20: ## %else18
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_22
+; AVX1-NEXT: ## BB#21: ## %cond.store19
+; AVX1-NEXT: vpextrb $10, %xmm1, 10(%rdi)
+; AVX1-NEXT: LBB57_22: ## %else20
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_24
+; AVX1-NEXT: ## BB#23: ## %cond.store21
+; AVX1-NEXT: vpextrb $11, %xmm1, 11(%rdi)
+; AVX1-NEXT: LBB57_24: ## %else22
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_26
+; AVX1-NEXT: ## BB#25: ## %cond.store23
+; AVX1-NEXT: vpextrb $12, %xmm1, 12(%rdi)
+; AVX1-NEXT: LBB57_26: ## %else24
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_28
+; AVX1-NEXT: ## BB#27: ## %cond.store25
+; AVX1-NEXT: vpextrb $13, %xmm1, 13(%rdi)
+; AVX1-NEXT: LBB57_28: ## %else26
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_30
+; AVX1-NEXT: ## BB#29: ## %cond.store27
+; AVX1-NEXT: vpextrb $14, %xmm1, 14(%rdi)
+; AVX1-NEXT: LBB57_30: ## %else28
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_32
+; AVX1-NEXT: ## BB#31: ## %cond.store29
+; AVX1-NEXT: vpextrb $15, %xmm1, 15(%rdi)
+; AVX1-NEXT: LBB57_32: ## %else30
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_34
+; AVX1-NEXT: ## BB#33: ## %cond.store31
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $0, %xmm2, 16(%rdi)
+; AVX1-NEXT: LBB57_34: ## %else32
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_36
+; AVX1-NEXT: ## BB#35: ## %cond.store33
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $1, %xmm2, 17(%rdi)
+; AVX1-NEXT: LBB57_36: ## %else34
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_38
+; AVX1-NEXT: ## BB#37: ## %cond.store35
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $2, %xmm2, 18(%rdi)
+; AVX1-NEXT: LBB57_38: ## %else36
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_40
+; AVX1-NEXT: ## BB#39: ## %cond.store37
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $3, %xmm2, 19(%rdi)
+; AVX1-NEXT: LBB57_40: ## %else38
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_42
+; AVX1-NEXT: ## BB#41: ## %cond.store39
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $4, %xmm2, 20(%rdi)
+; AVX1-NEXT: LBB57_42: ## %else40
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_44
+; AVX1-NEXT: ## BB#43: ## %cond.store41
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $5, %xmm2, 21(%rdi)
+; AVX1-NEXT: LBB57_44: ## %else42
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_46
+; AVX1-NEXT: ## BB#45: ## %cond.store43
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $6, %xmm2, 22(%rdi)
+; AVX1-NEXT: LBB57_46: ## %else44
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_48
+; AVX1-NEXT: ## BB#47: ## %cond.store45
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $7, %xmm2, 23(%rdi)
+; AVX1-NEXT: LBB57_48: ## %else46
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_50
+; AVX1-NEXT: ## BB#49: ## %cond.store47
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $8, %xmm2, 24(%rdi)
+; AVX1-NEXT: LBB57_50: ## %else48
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_52
+; AVX1-NEXT: ## BB#51: ## %cond.store49
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $9, %xmm2, 25(%rdi)
+; AVX1-NEXT: LBB57_52: ## %else50
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_54
+; AVX1-NEXT: ## BB#53: ## %cond.store51
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $10, %xmm2, 26(%rdi)
+; AVX1-NEXT: LBB57_54: ## %else52
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_56
+; AVX1-NEXT: ## BB#55: ## %cond.store53
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $11, %xmm2, 27(%rdi)
+; AVX1-NEXT: LBB57_56: ## %else54
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_58
+; AVX1-NEXT: ## BB#57: ## %cond.store55
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $12, %xmm2, 28(%rdi)
+; AVX1-NEXT: LBB57_58: ## %else56
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_60
+; AVX1-NEXT: ## BB#59: ## %cond.store57
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $13, %xmm2, 29(%rdi)
+; AVX1-NEXT: LBB57_60: ## %else58
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_62
+; AVX1-NEXT: ## BB#61: ## %cond.store59
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $14, %xmm2, 30(%rdi)
+; AVX1-NEXT: LBB57_62: ## %else60
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB57_64
+; AVX1-NEXT: ## BB#63: ## %cond.store61
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $15, %xmm0, 31(%rdi)
+; AVX1-NEXT: LBB57_64: ## %else62
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_store_32xi8:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_2
+; AVX2-NEXT: ## BB#1: ## %cond.store
+; AVX2-NEXT: vpextrb $0, %xmm1, (%rdi)
+; AVX2-NEXT: LBB57_2: ## %else
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_4
+; AVX2-NEXT: ## BB#3: ## %cond.store1
+; AVX2-NEXT: vpextrb $1, %xmm1, 1(%rdi)
+; AVX2-NEXT: LBB57_4: ## %else2
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_6
+; AVX2-NEXT: ## BB#5: ## %cond.store3
+; AVX2-NEXT: vpextrb $2, %xmm1, 2(%rdi)
+; AVX2-NEXT: LBB57_6: ## %else4
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_8
+; AVX2-NEXT: ## BB#7: ## %cond.store5
+; AVX2-NEXT: vpextrb $3, %xmm1, 3(%rdi)
+; AVX2-NEXT: LBB57_8: ## %else6
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_10
+; AVX2-NEXT: ## BB#9: ## %cond.store7
+; AVX2-NEXT: vpextrb $4, %xmm1, 4(%rdi)
+; AVX2-NEXT: LBB57_10: ## %else8
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_12
+; AVX2-NEXT: ## BB#11: ## %cond.store9
+; AVX2-NEXT: vpextrb $5, %xmm1, 5(%rdi)
+; AVX2-NEXT: LBB57_12: ## %else10
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_14
+; AVX2-NEXT: ## BB#13: ## %cond.store11
+; AVX2-NEXT: vpextrb $6, %xmm1, 6(%rdi)
+; AVX2-NEXT: LBB57_14: ## %else12
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_16
+; AVX2-NEXT: ## BB#15: ## %cond.store13
+; AVX2-NEXT: vpextrb $7, %xmm1, 7(%rdi)
+; AVX2-NEXT: LBB57_16: ## %else14
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_18
+; AVX2-NEXT: ## BB#17: ## %cond.store15
+; AVX2-NEXT: vpextrb $8, %xmm1, 8(%rdi)
+; AVX2-NEXT: LBB57_18: ## %else16
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_20
+; AVX2-NEXT: ## BB#19: ## %cond.store17
+; AVX2-NEXT: vpextrb $9, %xmm1, 9(%rdi)
+; AVX2-NEXT: LBB57_20: ## %else18
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_22
+; AVX2-NEXT: ## BB#21: ## %cond.store19
+; AVX2-NEXT: vpextrb $10, %xmm1, 10(%rdi)
+; AVX2-NEXT: LBB57_22: ## %else20
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_24
+; AVX2-NEXT: ## BB#23: ## %cond.store21
+; AVX2-NEXT: vpextrb $11, %xmm1, 11(%rdi)
+; AVX2-NEXT: LBB57_24: ## %else22
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_26
+; AVX2-NEXT: ## BB#25: ## %cond.store23
+; AVX2-NEXT: vpextrb $12, %xmm1, 12(%rdi)
+; AVX2-NEXT: LBB57_26: ## %else24
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_28
+; AVX2-NEXT: ## BB#27: ## %cond.store25
+; AVX2-NEXT: vpextrb $13, %xmm1, 13(%rdi)
+; AVX2-NEXT: LBB57_28: ## %else26
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_30
+; AVX2-NEXT: ## BB#29: ## %cond.store27
+; AVX2-NEXT: vpextrb $14, %xmm1, 14(%rdi)
+; AVX2-NEXT: LBB57_30: ## %else28
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_32
+; AVX2-NEXT: ## BB#31: ## %cond.store29
+; AVX2-NEXT: vpextrb $15, %xmm1, 15(%rdi)
+; AVX2-NEXT: LBB57_32: ## %else30
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_34
+; AVX2-NEXT: ## BB#33: ## %cond.store31
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $0, %xmm2, 16(%rdi)
+; AVX2-NEXT: LBB57_34: ## %else32
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_36
+; AVX2-NEXT: ## BB#35: ## %cond.store33
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $1, %xmm2, 17(%rdi)
+; AVX2-NEXT: LBB57_36: ## %else34
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_38
+; AVX2-NEXT: ## BB#37: ## %cond.store35
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $2, %xmm2, 18(%rdi)
+; AVX2-NEXT: LBB57_38: ## %else36
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_40
+; AVX2-NEXT: ## BB#39: ## %cond.store37
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $3, %xmm2, 19(%rdi)
+; AVX2-NEXT: LBB57_40: ## %else38
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_42
+; AVX2-NEXT: ## BB#41: ## %cond.store39
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $4, %xmm2, 20(%rdi)
+; AVX2-NEXT: LBB57_42: ## %else40
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_44
+; AVX2-NEXT: ## BB#43: ## %cond.store41
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $5, %xmm2, 21(%rdi)
+; AVX2-NEXT: LBB57_44: ## %else42
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_46
+; AVX2-NEXT: ## BB#45: ## %cond.store43
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $6, %xmm2, 22(%rdi)
+; AVX2-NEXT: LBB57_46: ## %else44
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_48
+; AVX2-NEXT: ## BB#47: ## %cond.store45
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $7, %xmm2, 23(%rdi)
+; AVX2-NEXT: LBB57_48: ## %else46
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_50
+; AVX2-NEXT: ## BB#49: ## %cond.store47
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $8, %xmm2, 24(%rdi)
+; AVX2-NEXT: LBB57_50: ## %else48
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_52
+; AVX2-NEXT: ## BB#51: ## %cond.store49
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $9, %xmm2, 25(%rdi)
+; AVX2-NEXT: LBB57_52: ## %else50
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_54
+; AVX2-NEXT: ## BB#53: ## %cond.store51
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $10, %xmm2, 26(%rdi)
+; AVX2-NEXT: LBB57_54: ## %else52
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_56
+; AVX2-NEXT: ## BB#55: ## %cond.store53
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $11, %xmm2, 27(%rdi)
+; AVX2-NEXT: LBB57_56: ## %else54
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_58
+; AVX2-NEXT: ## BB#57: ## %cond.store55
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $12, %xmm2, 28(%rdi)
+; AVX2-NEXT: LBB57_58: ## %else56
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_60
+; AVX2-NEXT: ## BB#59: ## %cond.store57
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $13, %xmm2, 29(%rdi)
+; AVX2-NEXT: LBB57_60: ## %else58
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_62
+; AVX2-NEXT: ## BB#61: ## %cond.store59
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $14, %xmm2, 30(%rdi)
+; AVX2-NEXT: LBB57_62: ## %else60
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB57_64
+; AVX2-NEXT: ## BB#63: ## %cond.store61
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $15, %xmm0, 31(%rdi)
+; AVX2-NEXT: LBB57_64: ## %else62
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_store_32xi8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_2
+; AVX512F-NEXT: ## BB#1: ## %cond.store
+; AVX512F-NEXT: vpextrb $0, %xmm1, (%rdi)
+; AVX512F-NEXT: LBB57_2: ## %else
+; AVX512F-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_4
+; AVX512F-NEXT: ## BB#3: ## %cond.store1
+; AVX512F-NEXT: vpextrb $1, %xmm1, 1(%rdi)
+; AVX512F-NEXT: LBB57_4: ## %else2
+; AVX512F-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_6
+; AVX512F-NEXT: ## BB#5: ## %cond.store3
+; AVX512F-NEXT: vpextrb $2, %xmm1, 2(%rdi)
+; AVX512F-NEXT: LBB57_6: ## %else4
+; AVX512F-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_8
+; AVX512F-NEXT: ## BB#7: ## %cond.store5
+; AVX512F-NEXT: vpextrb $3, %xmm1, 3(%rdi)
+; AVX512F-NEXT: LBB57_8: ## %else6
+; AVX512F-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_10
+; AVX512F-NEXT: ## BB#9: ## %cond.store7
+; AVX512F-NEXT: vpextrb $4, %xmm1, 4(%rdi)
+; AVX512F-NEXT: LBB57_10: ## %else8
+; AVX512F-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_12
+; AVX512F-NEXT: ## BB#11: ## %cond.store9
+; AVX512F-NEXT: vpextrb $5, %xmm1, 5(%rdi)
+; AVX512F-NEXT: LBB57_12: ## %else10
+; AVX512F-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_14
+; AVX512F-NEXT: ## BB#13: ## %cond.store11
+; AVX512F-NEXT: vpextrb $6, %xmm1, 6(%rdi)
+; AVX512F-NEXT: LBB57_14: ## %else12
+; AVX512F-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_16
+; AVX512F-NEXT: ## BB#15: ## %cond.store13
+; AVX512F-NEXT: vpextrb $7, %xmm1, 7(%rdi)
+; AVX512F-NEXT: LBB57_16: ## %else14
+; AVX512F-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_18
+; AVX512F-NEXT: ## BB#17: ## %cond.store15
+; AVX512F-NEXT: vpextrb $8, %xmm1, 8(%rdi)
+; AVX512F-NEXT: LBB57_18: ## %else16
+; AVX512F-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_20
+; AVX512F-NEXT: ## BB#19: ## %cond.store17
+; AVX512F-NEXT: vpextrb $9, %xmm1, 9(%rdi)
+; AVX512F-NEXT: LBB57_20: ## %else18
+; AVX512F-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_22
+; AVX512F-NEXT: ## BB#21: ## %cond.store19
+; AVX512F-NEXT: vpextrb $10, %xmm1, 10(%rdi)
+; AVX512F-NEXT: LBB57_22: ## %else20
+; AVX512F-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_24
+; AVX512F-NEXT: ## BB#23: ## %cond.store21
+; AVX512F-NEXT: vpextrb $11, %xmm1, 11(%rdi)
+; AVX512F-NEXT: LBB57_24: ## %else22
+; AVX512F-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_26
+; AVX512F-NEXT: ## BB#25: ## %cond.store23
+; AVX512F-NEXT: vpextrb $12, %xmm1, 12(%rdi)
+; AVX512F-NEXT: LBB57_26: ## %else24
+; AVX512F-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_28
+; AVX512F-NEXT: ## BB#27: ## %cond.store25
+; AVX512F-NEXT: vpextrb $13, %xmm1, 13(%rdi)
+; AVX512F-NEXT: LBB57_28: ## %else26
+; AVX512F-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_30
+; AVX512F-NEXT: ## BB#29: ## %cond.store27
+; AVX512F-NEXT: vpextrb $14, %xmm1, 14(%rdi)
+; AVX512F-NEXT: LBB57_30: ## %else28
+; AVX512F-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_32
+; AVX512F-NEXT: ## BB#31: ## %cond.store29
+; AVX512F-NEXT: vpextrb $15, %xmm1, 15(%rdi)
+; AVX512F-NEXT: LBB57_32: ## %else30
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_34
+; AVX512F-NEXT: ## BB#33: ## %cond.store31
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $0, %xmm2, 16(%rdi)
+; AVX512F-NEXT: LBB57_34: ## %else32
+; AVX512F-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_36
+; AVX512F-NEXT: ## BB#35: ## %cond.store33
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $1, %xmm2, 17(%rdi)
+; AVX512F-NEXT: LBB57_36: ## %else34
+; AVX512F-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_38
+; AVX512F-NEXT: ## BB#37: ## %cond.store35
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $2, %xmm2, 18(%rdi)
+; AVX512F-NEXT: LBB57_38: ## %else36
+; AVX512F-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_40
+; AVX512F-NEXT: ## BB#39: ## %cond.store37
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $3, %xmm2, 19(%rdi)
+; AVX512F-NEXT: LBB57_40: ## %else38
+; AVX512F-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_42
+; AVX512F-NEXT: ## BB#41: ## %cond.store39
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $4, %xmm2, 20(%rdi)
+; AVX512F-NEXT: LBB57_42: ## %else40
+; AVX512F-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_44
+; AVX512F-NEXT: ## BB#43: ## %cond.store41
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $5, %xmm2, 21(%rdi)
+; AVX512F-NEXT: LBB57_44: ## %else42
+; AVX512F-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_46
+; AVX512F-NEXT: ## BB#45: ## %cond.store43
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $6, %xmm2, 22(%rdi)
+; AVX512F-NEXT: LBB57_46: ## %else44
+; AVX512F-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_48
+; AVX512F-NEXT: ## BB#47: ## %cond.store45
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $7, %xmm2, 23(%rdi)
+; AVX512F-NEXT: LBB57_48: ## %else46
+; AVX512F-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_50
+; AVX512F-NEXT: ## BB#49: ## %cond.store47
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $8, %xmm2, 24(%rdi)
+; AVX512F-NEXT: LBB57_50: ## %else48
+; AVX512F-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_52
+; AVX512F-NEXT: ## BB#51: ## %cond.store49
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $9, %xmm2, 25(%rdi)
+; AVX512F-NEXT: LBB57_52: ## %else50
+; AVX512F-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_54
+; AVX512F-NEXT: ## BB#53: ## %cond.store51
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $10, %xmm2, 26(%rdi)
+; AVX512F-NEXT: LBB57_54: ## %else52
+; AVX512F-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_56
+; AVX512F-NEXT: ## BB#55: ## %cond.store53
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $11, %xmm2, 27(%rdi)
+; AVX512F-NEXT: LBB57_56: ## %else54
+; AVX512F-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_58
+; AVX512F-NEXT: ## BB#57: ## %cond.store55
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $12, %xmm2, 28(%rdi)
+; AVX512F-NEXT: LBB57_58: ## %else56
+; AVX512F-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_60
+; AVX512F-NEXT: ## BB#59: ## %cond.store57
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $13, %xmm2, 29(%rdi)
+; AVX512F-NEXT: LBB57_60: ## %else58
+; AVX512F-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_62
+; AVX512F-NEXT: ## BB#61: ## %cond.store59
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $14, %xmm2, 30(%rdi)
+; AVX512F-NEXT: LBB57_62: ## %else60
+; AVX512F-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB57_64
+; AVX512F-NEXT: ## BB#63: ## %cond.store61
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrb $15, %xmm0, 31(%rdi)
+; AVX512F-NEXT: LBB57_64: ## %else62
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_store_32xi8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
+; SKX-NEXT: vpmovb2m %ymm0, %k1
+; SKX-NEXT: vmovdqu8 %ymm1, (%rdi) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> %val, <32 x i8>* %addr, i32 4, <32 x i1>%mask)
+ ret void
+}
+declare void @llvm.masked.store.v32i8.p0v32i8(<32 x i8>, <32 x i8>*, i32, <32 x i1>)
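+; Note on the checks above: without AVX512BW there is no byte-granularity
+; masked store, so the AVX512F output tests each mask bit (vpextrb + testb)
+; and stores one byte at a time with vpextrb, re-extracting the high lane
+; with vextracti128 as needed. The SKX run shifts each mask byte's low bit
+; into the sign position (vpsllw $7), forms a k-register with vpmovb2m, and
+; emits a single masked vmovdqu8.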
+
+define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> %val) {
+; AVX1-LABEL: test_mask_store_64xi8:
+; AVX1: ## BB#0:
+; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: testb $1, %dil
+; AVX1-NEXT: je LBB58_2
+; AVX1-NEXT: ## BB#1: ## %cond.store
+; AVX1-NEXT: vpextrb $0, %xmm0, (%rax)
+; AVX1-NEXT: LBB58_2: ## %else
+; AVX1-NEXT: testb $1, %sil
+; AVX1-NEXT: je LBB58_4
+; AVX1-NEXT: ## BB#3: ## %cond.store1
+; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rax)
+; AVX1-NEXT: LBB58_4: ## %else2
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_6
+; AVX1-NEXT: ## BB#5: ## %cond.store3
+; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rax)
+; AVX1-NEXT: LBB58_6: ## %else4
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_8
+; AVX1-NEXT: ## BB#7: ## %cond.store5
+; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rax)
+; AVX1-NEXT: LBB58_8: ## %else6
+; AVX1-NEXT: testb $1, %r8b
+; AVX1-NEXT: je LBB58_10
+; AVX1-NEXT: ## BB#9: ## %cond.store7
+; AVX1-NEXT: vpextrb $4, %xmm0, 4(%rax)
+; AVX1-NEXT: LBB58_10: ## %else8
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %r9b
+; AVX1-NEXT: je LBB58_12
+; AVX1-NEXT: ## BB#11: ## %cond.store9
+; AVX1-NEXT: vpextrb $5, %xmm0, 5(%rax)
+; AVX1-NEXT: LBB58_12: ## %else10
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_14
+; AVX1-NEXT: ## BB#13: ## %cond.store11
+; AVX1-NEXT: vpextrb $6, %xmm0, 6(%rax)
+; AVX1-NEXT: LBB58_14: ## %else12
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_16
+; AVX1-NEXT: ## BB#15: ## %cond.store13
+; AVX1-NEXT: vpextrb $7, %xmm0, 7(%rax)
+; AVX1-NEXT: LBB58_16: ## %else14
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_18
+; AVX1-NEXT: ## BB#17: ## %cond.store15
+; AVX1-NEXT: vpextrb $8, %xmm0, 8(%rax)
+; AVX1-NEXT: LBB58_18: ## %else16
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_20
+; AVX1-NEXT: ## BB#19: ## %cond.store17
+; AVX1-NEXT: vpextrb $9, %xmm0, 9(%rax)
+; AVX1-NEXT: LBB58_20: ## %else18
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_22
+; AVX1-NEXT: ## BB#21: ## %cond.store19
+; AVX1-NEXT: vpextrb $10, %xmm0, 10(%rax)
+; AVX1-NEXT: LBB58_22: ## %else20
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_24
+; AVX1-NEXT: ## BB#23: ## %cond.store21
+; AVX1-NEXT: vpextrb $11, %xmm0, 11(%rax)
+; AVX1-NEXT: LBB58_24: ## %else22
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_26
+; AVX1-NEXT: ## BB#25: ## %cond.store23
+; AVX1-NEXT: vpextrb $12, %xmm0, 12(%rax)
+; AVX1-NEXT: LBB58_26: ## %else24
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_28
+; AVX1-NEXT: ## BB#27: ## %cond.store25
+; AVX1-NEXT: vpextrb $13, %xmm0, 13(%rax)
+; AVX1-NEXT: LBB58_28: ## %else26
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_30
+; AVX1-NEXT: ## BB#29: ## %cond.store27
+; AVX1-NEXT: vpextrb $14, %xmm0, 14(%rax)
+; AVX1-NEXT: LBB58_30: ## %else28
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_32
+; AVX1-NEXT: ## BB#31: ## %cond.store29
+; AVX1-NEXT: vpextrb $15, %xmm0, 15(%rax)
+; AVX1-NEXT: LBB58_32: ## %else30
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_34
+; AVX1-NEXT: ## BB#33: ## %cond.store31
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $0, %xmm2, 16(%rax)
+; AVX1-NEXT: LBB58_34: ## %else32
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_36
+; AVX1-NEXT: ## BB#35: ## %cond.store33
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $1, %xmm2, 17(%rax)
+; AVX1-NEXT: LBB58_36: ## %else34
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_38
+; AVX1-NEXT: ## BB#37: ## %cond.store35
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $2, %xmm2, 18(%rax)
+; AVX1-NEXT: LBB58_38: ## %else36
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_40
+; AVX1-NEXT: ## BB#39: ## %cond.store37
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $3, %xmm2, 19(%rax)
+; AVX1-NEXT: LBB58_40: ## %else38
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_42
+; AVX1-NEXT: ## BB#41: ## %cond.store39
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $4, %xmm2, 20(%rax)
+; AVX1-NEXT: LBB58_42: ## %else40
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_44
+; AVX1-NEXT: ## BB#43: ## %cond.store41
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $5, %xmm2, 21(%rax)
+; AVX1-NEXT: LBB58_44: ## %else42
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_46
+; AVX1-NEXT: ## BB#45: ## %cond.store43
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $6, %xmm2, 22(%rax)
+; AVX1-NEXT: LBB58_46: ## %else44
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_48
+; AVX1-NEXT: ## BB#47: ## %cond.store45
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $7, %xmm2, 23(%rax)
+; AVX1-NEXT: LBB58_48: ## %else46
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_50
+; AVX1-NEXT: ## BB#49: ## %cond.store47
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $8, %xmm2, 24(%rax)
+; AVX1-NEXT: LBB58_50: ## %else48
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_52
+; AVX1-NEXT: ## BB#51: ## %cond.store49
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $9, %xmm2, 25(%rax)
+; AVX1-NEXT: LBB58_52: ## %else50
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_54
+; AVX1-NEXT: ## BB#53: ## %cond.store51
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $10, %xmm2, 26(%rax)
+; AVX1-NEXT: LBB58_54: ## %else52
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_56
+; AVX1-NEXT: ## BB#55: ## %cond.store53
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $11, %xmm2, 27(%rax)
+; AVX1-NEXT: LBB58_56: ## %else54
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_58
+; AVX1-NEXT: ## BB#57: ## %cond.store55
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $12, %xmm2, 28(%rax)
+; AVX1-NEXT: LBB58_58: ## %else56
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_60
+; AVX1-NEXT: ## BB#59: ## %cond.store57
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $13, %xmm2, 29(%rax)
+; AVX1-NEXT: LBB58_60: ## %else58
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_62
+; AVX1-NEXT: ## BB#61: ## %cond.store59
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpextrb $14, %xmm2, 30(%rax)
+; AVX1-NEXT: LBB58_62: ## %else60
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_64
+; AVX1-NEXT: ## BB#63: ## %cond.store61
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpextrb $15, %xmm0, 31(%rax)
+; AVX1-NEXT: LBB58_64: ## %else62
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_66
+; AVX1-NEXT: ## BB#65: ## %cond.store63
+; AVX1-NEXT: vpextrb $0, %xmm1, 32(%rax)
+; AVX1-NEXT: LBB58_66: ## %else64
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_68
+; AVX1-NEXT: ## BB#67: ## %cond.store65
+; AVX1-NEXT: vpextrb $1, %xmm1, 33(%rax)
+; AVX1-NEXT: LBB58_68: ## %else66
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_70
+; AVX1-NEXT: ## BB#69: ## %cond.store67
+; AVX1-NEXT: vpextrb $2, %xmm1, 34(%rax)
+; AVX1-NEXT: LBB58_70: ## %else68
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_72
+; AVX1-NEXT: ## BB#71: ## %cond.store69
+; AVX1-NEXT: vpextrb $3, %xmm1, 35(%rax)
+; AVX1-NEXT: LBB58_72: ## %else70
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_74
+; AVX1-NEXT: ## BB#73: ## %cond.store71
+; AVX1-NEXT: vpextrb $4, %xmm1, 36(%rax)
+; AVX1-NEXT: LBB58_74: ## %else72
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_76
+; AVX1-NEXT: ## BB#75: ## %cond.store73
+; AVX1-NEXT: vpextrb $5, %xmm1, 37(%rax)
+; AVX1-NEXT: LBB58_76: ## %else74
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_78
+; AVX1-NEXT: ## BB#77: ## %cond.store75
+; AVX1-NEXT: vpextrb $6, %xmm1, 38(%rax)
+; AVX1-NEXT: LBB58_78: ## %else76
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_80
+; AVX1-NEXT: ## BB#79: ## %cond.store77
+; AVX1-NEXT: vpextrb $7, %xmm1, 39(%rax)
+; AVX1-NEXT: LBB58_80: ## %else78
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_82
+; AVX1-NEXT: ## BB#81: ## %cond.store79
+; AVX1-NEXT: vpextrb $8, %xmm1, 40(%rax)
+; AVX1-NEXT: LBB58_82: ## %else80
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_84
+; AVX1-NEXT: ## BB#83: ## %cond.store81
+; AVX1-NEXT: vpextrb $9, %xmm1, 41(%rax)
+; AVX1-NEXT: LBB58_84: ## %else82
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_86
+; AVX1-NEXT: ## BB#85: ## %cond.store83
+; AVX1-NEXT: vpextrb $10, %xmm1, 42(%rax)
+; AVX1-NEXT: LBB58_86: ## %else84
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_88
+; AVX1-NEXT: ## BB#87: ## %cond.store85
+; AVX1-NEXT: vpextrb $11, %xmm1, 43(%rax)
+; AVX1-NEXT: LBB58_88: ## %else86
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_90
+; AVX1-NEXT: ## BB#89: ## %cond.store87
+; AVX1-NEXT: vpextrb $12, %xmm1, 44(%rax)
+; AVX1-NEXT: LBB58_90: ## %else88
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_92
+; AVX1-NEXT: ## BB#91: ## %cond.store89
+; AVX1-NEXT: vpextrb $13, %xmm1, 45(%rax)
+; AVX1-NEXT: LBB58_92: ## %else90
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_94
+; AVX1-NEXT: ## BB#93: ## %cond.store91
+; AVX1-NEXT: vpextrb $14, %xmm1, 46(%rax)
+; AVX1-NEXT: LBB58_94: ## %else92
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_96
+; AVX1-NEXT: ## BB#95: ## %cond.store93
+; AVX1-NEXT: vpextrb $15, %xmm1, 47(%rax)
+; AVX1-NEXT: LBB58_96: ## %else94
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_98
+; AVX1-NEXT: ## BB#97: ## %cond.store95
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm0, 48(%rax)
+; AVX1-NEXT: LBB58_98: ## %else96
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_100
+; AVX1-NEXT: ## BB#99: ## %cond.store97
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $1, %xmm0, 49(%rax)
+; AVX1-NEXT: LBB58_100: ## %else98
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_102
+; AVX1-NEXT: ## BB#101: ## %cond.store99
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $2, %xmm0, 50(%rax)
+; AVX1-NEXT: LBB58_102: ## %else100
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_104
+; AVX1-NEXT: ## BB#103: ## %cond.store101
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $3, %xmm0, 51(%rax)
+; AVX1-NEXT: LBB58_104: ## %else102
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_106
+; AVX1-NEXT: ## BB#105: ## %cond.store103
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $4, %xmm0, 52(%rax)
+; AVX1-NEXT: LBB58_106: ## %else104
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_108
+; AVX1-NEXT: ## BB#107: ## %cond.store105
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $5, %xmm0, 53(%rax)
+; AVX1-NEXT: LBB58_108: ## %else106
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_110
+; AVX1-NEXT: ## BB#109: ## %cond.store107
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $6, %xmm0, 54(%rax)
+; AVX1-NEXT: LBB58_110: ## %else108
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_112
+; AVX1-NEXT: ## BB#111: ## %cond.store109
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $7, %xmm0, 55(%rax)
+; AVX1-NEXT: LBB58_112: ## %else110
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_114
+; AVX1-NEXT: ## BB#113: ## %cond.store111
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $8, %xmm0, 56(%rax)
+; AVX1-NEXT: LBB58_114: ## %else112
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_116
+; AVX1-NEXT: ## BB#115: ## %cond.store113
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $9, %xmm0, 57(%rax)
+; AVX1-NEXT: LBB58_116: ## %else114
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_118
+; AVX1-NEXT: ## BB#117: ## %cond.store115
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $10, %xmm0, 58(%rax)
+; AVX1-NEXT: LBB58_118: ## %else116
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_120
+; AVX1-NEXT: ## BB#119: ## %cond.store117
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $11, %xmm0, 59(%rax)
+; AVX1-NEXT: LBB58_120: ## %else118
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_122
+; AVX1-NEXT: ## BB#121: ## %cond.store119
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $12, %xmm0, 60(%rax)
+; AVX1-NEXT: LBB58_122: ## %else120
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_124
+; AVX1-NEXT: ## BB#123: ## %cond.store121
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $13, %xmm0, 61(%rax)
+; AVX1-NEXT: LBB58_124: ## %else122
+; AVX1-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX1-NEXT: testb $1, %cl
+; AVX1-NEXT: je LBB58_126
+; AVX1-NEXT: ## BB#125: ## %cond.store123
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $14, %xmm0, 62(%rax)
+; AVX1-NEXT: LBB58_126: ## %else124
+; AVX1-NEXT: testb $1, %dl
+; AVX1-NEXT: je LBB58_128
+; AVX1-NEXT: ## BB#127: ## %cond.store125
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrb $15, %xmm0, 63(%rax)
+; AVX1-NEXT: LBB58_128: ## %else126
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_store_64xi8:
+; AVX2: ## BB#0:
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: je LBB58_2
+; AVX2-NEXT: ## BB#1: ## %cond.store
+; AVX2-NEXT: vpextrb $0, %xmm0, (%rax)
+; AVX2-NEXT: LBB58_2: ## %else
+; AVX2-NEXT: testb $1, %sil
+; AVX2-NEXT: je LBB58_4
+; AVX2-NEXT: ## BB#3: ## %cond.store1
+; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rax)
+; AVX2-NEXT: LBB58_4: ## %else2
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_6
+; AVX2-NEXT: ## BB#5: ## %cond.store3
+; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rax)
+; AVX2-NEXT: LBB58_6: ## %else4
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_8
+; AVX2-NEXT: ## BB#7: ## %cond.store5
+; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rax)
+; AVX2-NEXT: LBB58_8: ## %else6
+; AVX2-NEXT: testb $1, %r8b
+; AVX2-NEXT: je LBB58_10
+; AVX2-NEXT: ## BB#9: ## %cond.store7
+; AVX2-NEXT: vpextrb $4, %xmm0, 4(%rax)
+; AVX2-NEXT: LBB58_10: ## %else8
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %r9b
+; AVX2-NEXT: je LBB58_12
+; AVX2-NEXT: ## BB#11: ## %cond.store9
+; AVX2-NEXT: vpextrb $5, %xmm0, 5(%rax)
+; AVX2-NEXT: LBB58_12: ## %else10
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_14
+; AVX2-NEXT: ## BB#13: ## %cond.store11
+; AVX2-NEXT: vpextrb $6, %xmm0, 6(%rax)
+; AVX2-NEXT: LBB58_14: ## %else12
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_16
+; AVX2-NEXT: ## BB#15: ## %cond.store13
+; AVX2-NEXT: vpextrb $7, %xmm0, 7(%rax)
+; AVX2-NEXT: LBB58_16: ## %else14
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_18
+; AVX2-NEXT: ## BB#17: ## %cond.store15
+; AVX2-NEXT: vpextrb $8, %xmm0, 8(%rax)
+; AVX2-NEXT: LBB58_18: ## %else16
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_20
+; AVX2-NEXT: ## BB#19: ## %cond.store17
+; AVX2-NEXT: vpextrb $9, %xmm0, 9(%rax)
+; AVX2-NEXT: LBB58_20: ## %else18
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_22
+; AVX2-NEXT: ## BB#21: ## %cond.store19
+; AVX2-NEXT: vpextrb $10, %xmm0, 10(%rax)
+; AVX2-NEXT: LBB58_22: ## %else20
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_24
+; AVX2-NEXT: ## BB#23: ## %cond.store21
+; AVX2-NEXT: vpextrb $11, %xmm0, 11(%rax)
+; AVX2-NEXT: LBB58_24: ## %else22
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_26
+; AVX2-NEXT: ## BB#25: ## %cond.store23
+; AVX2-NEXT: vpextrb $12, %xmm0, 12(%rax)
+; AVX2-NEXT: LBB58_26: ## %else24
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_28
+; AVX2-NEXT: ## BB#27: ## %cond.store25
+; AVX2-NEXT: vpextrb $13, %xmm0, 13(%rax)
+; AVX2-NEXT: LBB58_28: ## %else26
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_30
+; AVX2-NEXT: ## BB#29: ## %cond.store27
+; AVX2-NEXT: vpextrb $14, %xmm0, 14(%rax)
+; AVX2-NEXT: LBB58_30: ## %else28
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_32
+; AVX2-NEXT: ## BB#31: ## %cond.store29
+; AVX2-NEXT: vpextrb $15, %xmm0, 15(%rax)
+; AVX2-NEXT: LBB58_32: ## %else30
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_34
+; AVX2-NEXT: ## BB#33: ## %cond.store31
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $0, %xmm2, 16(%rax)
+; AVX2-NEXT: LBB58_34: ## %else32
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_36
+; AVX2-NEXT: ## BB#35: ## %cond.store33
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $1, %xmm2, 17(%rax)
+; AVX2-NEXT: LBB58_36: ## %else34
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_38
+; AVX2-NEXT: ## BB#37: ## %cond.store35
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $2, %xmm2, 18(%rax)
+; AVX2-NEXT: LBB58_38: ## %else36
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_40
+; AVX2-NEXT: ## BB#39: ## %cond.store37
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $3, %xmm2, 19(%rax)
+; AVX2-NEXT: LBB58_40: ## %else38
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_42
+; AVX2-NEXT: ## BB#41: ## %cond.store39
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $4, %xmm2, 20(%rax)
+; AVX2-NEXT: LBB58_42: ## %else40
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_44
+; AVX2-NEXT: ## BB#43: ## %cond.store41
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $5, %xmm2, 21(%rax)
+; AVX2-NEXT: LBB58_44: ## %else42
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_46
+; AVX2-NEXT: ## BB#45: ## %cond.store43
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $6, %xmm2, 22(%rax)
+; AVX2-NEXT: LBB58_46: ## %else44
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_48
+; AVX2-NEXT: ## BB#47: ## %cond.store45
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $7, %xmm2, 23(%rax)
+; AVX2-NEXT: LBB58_48: ## %else46
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_50
+; AVX2-NEXT: ## BB#49: ## %cond.store47
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $8, %xmm2, 24(%rax)
+; AVX2-NEXT: LBB58_50: ## %else48
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_52
+; AVX2-NEXT: ## BB#51: ## %cond.store49
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $9, %xmm2, 25(%rax)
+; AVX2-NEXT: LBB58_52: ## %else50
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_54
+; AVX2-NEXT: ## BB#53: ## %cond.store51
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $10, %xmm2, 26(%rax)
+; AVX2-NEXT: LBB58_54: ## %else52
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_56
+; AVX2-NEXT: ## BB#55: ## %cond.store53
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $11, %xmm2, 27(%rax)
+; AVX2-NEXT: LBB58_56: ## %else54
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_58
+; AVX2-NEXT: ## BB#57: ## %cond.store55
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $12, %xmm2, 28(%rax)
+; AVX2-NEXT: LBB58_58: ## %else56
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_60
+; AVX2-NEXT: ## BB#59: ## %cond.store57
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $13, %xmm2, 29(%rax)
+; AVX2-NEXT: LBB58_60: ## %else58
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_62
+; AVX2-NEXT: ## BB#61: ## %cond.store59
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrb $14, %xmm2, 30(%rax)
+; AVX2-NEXT: LBB58_62: ## %else60
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_64
+; AVX2-NEXT: ## BB#63: ## %cond.store61
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrb $15, %xmm0, 31(%rax)
+; AVX2-NEXT: LBB58_64: ## %else62
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_66
+; AVX2-NEXT: ## BB#65: ## %cond.store63
+; AVX2-NEXT: vpextrb $0, %xmm1, 32(%rax)
+; AVX2-NEXT: LBB58_66: ## %else64
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_68
+; AVX2-NEXT: ## BB#67: ## %cond.store65
+; AVX2-NEXT: vpextrb $1, %xmm1, 33(%rax)
+; AVX2-NEXT: LBB58_68: ## %else66
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_70
+; AVX2-NEXT: ## BB#69: ## %cond.store67
+; AVX2-NEXT: vpextrb $2, %xmm1, 34(%rax)
+; AVX2-NEXT: LBB58_70: ## %else68
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_72
+; AVX2-NEXT: ## BB#71: ## %cond.store69
+; AVX2-NEXT: vpextrb $3, %xmm1, 35(%rax)
+; AVX2-NEXT: LBB58_72: ## %else70
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_74
+; AVX2-NEXT: ## BB#73: ## %cond.store71
+; AVX2-NEXT: vpextrb $4, %xmm1, 36(%rax)
+; AVX2-NEXT: LBB58_74: ## %else72
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_76
+; AVX2-NEXT: ## BB#75: ## %cond.store73
+; AVX2-NEXT: vpextrb $5, %xmm1, 37(%rax)
+; AVX2-NEXT: LBB58_76: ## %else74
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_78
+; AVX2-NEXT: ## BB#77: ## %cond.store75
+; AVX2-NEXT: vpextrb $6, %xmm1, 38(%rax)
+; AVX2-NEXT: LBB58_78: ## %else76
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_80
+; AVX2-NEXT: ## BB#79: ## %cond.store77
+; AVX2-NEXT: vpextrb $7, %xmm1, 39(%rax)
+; AVX2-NEXT: LBB58_80: ## %else78
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_82
+; AVX2-NEXT: ## BB#81: ## %cond.store79
+; AVX2-NEXT: vpextrb $8, %xmm1, 40(%rax)
+; AVX2-NEXT: LBB58_82: ## %else80
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_84
+; AVX2-NEXT: ## BB#83: ## %cond.store81
+; AVX2-NEXT: vpextrb $9, %xmm1, 41(%rax)
+; AVX2-NEXT: LBB58_84: ## %else82
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_86
+; AVX2-NEXT: ## BB#85: ## %cond.store83
+; AVX2-NEXT: vpextrb $10, %xmm1, 42(%rax)
+; AVX2-NEXT: LBB58_86: ## %else84
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_88
+; AVX2-NEXT: ## BB#87: ## %cond.store85
+; AVX2-NEXT: vpextrb $11, %xmm1, 43(%rax)
+; AVX2-NEXT: LBB58_88: ## %else86
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_90
+; AVX2-NEXT: ## BB#89: ## %cond.store87
+; AVX2-NEXT: vpextrb $12, %xmm1, 44(%rax)
+; AVX2-NEXT: LBB58_90: ## %else88
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_92
+; AVX2-NEXT: ## BB#91: ## %cond.store89
+; AVX2-NEXT: vpextrb $13, %xmm1, 45(%rax)
+; AVX2-NEXT: LBB58_92: ## %else90
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_94
+; AVX2-NEXT: ## BB#93: ## %cond.store91
+; AVX2-NEXT: vpextrb $14, %xmm1, 46(%rax)
+; AVX2-NEXT: LBB58_94: ## %else92
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_96
+; AVX2-NEXT: ## BB#95: ## %cond.store93
+; AVX2-NEXT: vpextrb $15, %xmm1, 47(%rax)
+; AVX2-NEXT: LBB58_96: ## %else94
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_98
+; AVX2-NEXT: ## BB#97: ## %cond.store95
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, 48(%rax)
+; AVX2-NEXT: LBB58_98: ## %else96
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_100
+; AVX2-NEXT: ## BB#99: ## %cond.store97
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $1, %xmm0, 49(%rax)
+; AVX2-NEXT: LBB58_100: ## %else98
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_102
+; AVX2-NEXT: ## BB#101: ## %cond.store99
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $2, %xmm0, 50(%rax)
+; AVX2-NEXT: LBB58_102: ## %else100
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_104
+; AVX2-NEXT: ## BB#103: ## %cond.store101
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $3, %xmm0, 51(%rax)
+; AVX2-NEXT: LBB58_104: ## %else102
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_106
+; AVX2-NEXT: ## BB#105: ## %cond.store103
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $4, %xmm0, 52(%rax)
+; AVX2-NEXT: LBB58_106: ## %else104
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_108
+; AVX2-NEXT: ## BB#107: ## %cond.store105
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $5, %xmm0, 53(%rax)
+; AVX2-NEXT: LBB58_108: ## %else106
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_110
+; AVX2-NEXT: ## BB#109: ## %cond.store107
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $6, %xmm0, 54(%rax)
+; AVX2-NEXT: LBB58_110: ## %else108
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_112
+; AVX2-NEXT: ## BB#111: ## %cond.store109
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $7, %xmm0, 55(%rax)
+; AVX2-NEXT: LBB58_112: ## %else110
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_114
+; AVX2-NEXT: ## BB#113: ## %cond.store111
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $8, %xmm0, 56(%rax)
+; AVX2-NEXT: LBB58_114: ## %else112
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_116
+; AVX2-NEXT: ## BB#115: ## %cond.store113
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $9, %xmm0, 57(%rax)
+; AVX2-NEXT: LBB58_116: ## %else114
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_118
+; AVX2-NEXT: ## BB#117: ## %cond.store115
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $10, %xmm0, 58(%rax)
+; AVX2-NEXT: LBB58_118: ## %else116
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_120
+; AVX2-NEXT: ## BB#119: ## %cond.store117
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $11, %xmm0, 59(%rax)
+; AVX2-NEXT: LBB58_120: ## %else118
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_122
+; AVX2-NEXT: ## BB#121: ## %cond.store119
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $12, %xmm0, 60(%rax)
+; AVX2-NEXT: LBB58_122: ## %else120
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %cl
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_124
+; AVX2-NEXT: ## BB#123: ## %cond.store121
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $13, %xmm0, 61(%rax)
+; AVX2-NEXT: LBB58_124: ## %else122
+; AVX2-NEXT: movb {{[0-9]+}}(%rsp), %dl
+; AVX2-NEXT: testb $1, %cl
+; AVX2-NEXT: je LBB58_126
+; AVX2-NEXT: ## BB#125: ## %cond.store123
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $14, %xmm0, 62(%rax)
+; AVX2-NEXT: LBB58_126: ## %else124
+; AVX2-NEXT: testb $1, %dl
+; AVX2-NEXT: je LBB58_128
+; AVX2-NEXT: ## BB#127: ## %cond.store125
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrb $15, %xmm0, 63(%rax)
+; AVX2-NEXT: LBB58_128: ## %else126
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_store_64xi8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_2
+; AVX512F-NEXT: ## BB#1: ## %cond.store
+; AVX512F-NEXT: vpextrb $0, %xmm4, (%rdi)
+; AVX512F-NEXT: LBB58_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_4
+; AVX512F-NEXT: ## BB#3: ## %cond.store1
+; AVX512F-NEXT: vpextrb $1, %xmm4, 1(%rdi)
+; AVX512F-NEXT: LBB58_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_6
+; AVX512F-NEXT: ## BB#5: ## %cond.store3
+; AVX512F-NEXT: vpextrb $2, %xmm4, 2(%rdi)
+; AVX512F-NEXT: LBB58_6: ## %else4
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_8
+; AVX512F-NEXT: ## BB#7: ## %cond.store5
+; AVX512F-NEXT: vpextrb $3, %xmm4, 3(%rdi)
+; AVX512F-NEXT: LBB58_8: ## %else6
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_10
+; AVX512F-NEXT: ## BB#9: ## %cond.store7
+; AVX512F-NEXT: vpextrb $4, %xmm4, 4(%rdi)
+; AVX512F-NEXT: LBB58_10: ## %else8
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_12
+; AVX512F-NEXT: ## BB#11: ## %cond.store9
+; AVX512F-NEXT: vpextrb $5, %xmm4, 5(%rdi)
+; AVX512F-NEXT: LBB58_12: ## %else10
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_14
+; AVX512F-NEXT: ## BB#13: ## %cond.store11
+; AVX512F-NEXT: vpextrb $6, %xmm4, 6(%rdi)
+; AVX512F-NEXT: LBB58_14: ## %else12
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_16
+; AVX512F-NEXT: ## BB#15: ## %cond.store13
+; AVX512F-NEXT: vpextrb $7, %xmm4, 7(%rdi)
+; AVX512F-NEXT: LBB58_16: ## %else14
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_18
+; AVX512F-NEXT: ## BB#17: ## %cond.store15
+; AVX512F-NEXT: vpextrb $8, %xmm4, 8(%rdi)
+; AVX512F-NEXT: LBB58_18: ## %else16
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_20
+; AVX512F-NEXT: ## BB#19: ## %cond.store17
+; AVX512F-NEXT: vpextrb $9, %xmm4, 9(%rdi)
+; AVX512F-NEXT: LBB58_20: ## %else18
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_22
+; AVX512F-NEXT: ## BB#21: ## %cond.store19
+; AVX512F-NEXT: vpextrb $10, %xmm4, 10(%rdi)
+; AVX512F-NEXT: LBB58_22: ## %else20
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_24
+; AVX512F-NEXT: ## BB#23: ## %cond.store21
+; AVX512F-NEXT: vpextrb $11, %xmm4, 11(%rdi)
+; AVX512F-NEXT: LBB58_24: ## %else22
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_26
+; AVX512F-NEXT: ## BB#25: ## %cond.store23
+; AVX512F-NEXT: vpextrb $12, %xmm4, 12(%rdi)
+; AVX512F-NEXT: LBB58_26: ## %else24
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm0
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_28
+; AVX512F-NEXT: ## BB#27: ## %cond.store25
+; AVX512F-NEXT: vpextrb $13, %xmm4, 13(%rdi)
+; AVX512F-NEXT: LBB58_28: ## %else26
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_30
+; AVX512F-NEXT: ## BB#29: ## %cond.store27
+; AVX512F-NEXT: vpextrb $14, %xmm4, 14(%rdi)
+; AVX512F-NEXT: LBB58_30: ## %else28
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: kshiftlw $0, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_32
+; AVX512F-NEXT: ## BB#31: ## %cond.store29
+; AVX512F-NEXT: vpextrb $15, %xmm4, 15(%rdi)
+; AVX512F-NEXT: LBB58_32: ## %else30
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_34
+; AVX512F-NEXT: ## BB#33: ## %cond.store31
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $0, %xmm0, 16(%rdi)
+; AVX512F-NEXT: LBB58_34: ## %else32
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_36
+; AVX512F-NEXT: ## BB#35: ## %cond.store33
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $1, %xmm0, 17(%rdi)
+; AVX512F-NEXT: LBB58_36: ## %else34
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_38
+; AVX512F-NEXT: ## BB#37: ## %cond.store35
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $2, %xmm0, 18(%rdi)
+; AVX512F-NEXT: LBB58_38: ## %else36
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_40
+; AVX512F-NEXT: ## BB#39: ## %cond.store37
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $3, %xmm0, 19(%rdi)
+; AVX512F-NEXT: LBB58_40: ## %else38
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_42
+; AVX512F-NEXT: ## BB#41: ## %cond.store39
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $4, %xmm0, 20(%rdi)
+; AVX512F-NEXT: LBB58_42: ## %else40
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_44
+; AVX512F-NEXT: ## BB#43: ## %cond.store41
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $5, %xmm0, 21(%rdi)
+; AVX512F-NEXT: LBB58_44: ## %else42
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_46
+; AVX512F-NEXT: ## BB#45: ## %cond.store43
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $6, %xmm0, 22(%rdi)
+; AVX512F-NEXT: LBB58_46: ## %else44
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_48
+; AVX512F-NEXT: ## BB#47: ## %cond.store45
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $7, %xmm0, 23(%rdi)
+; AVX512F-NEXT: LBB58_48: ## %else46
+; AVX512F-NEXT: kshiftlw $7, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_50
+; AVX512F-NEXT: ## BB#49: ## %cond.store47
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $8, %xmm0, 24(%rdi)
+; AVX512F-NEXT: LBB58_50: ## %else48
+; AVX512F-NEXT: kshiftlw $6, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_52
+; AVX512F-NEXT: ## BB#51: ## %cond.store49
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $9, %xmm0, 25(%rdi)
+; AVX512F-NEXT: LBB58_52: ## %else50
+; AVX512F-NEXT: kshiftlw $5, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_54
+; AVX512F-NEXT: ## BB#53: ## %cond.store51
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $10, %xmm0, 26(%rdi)
+; AVX512F-NEXT: LBB58_54: ## %else52
+; AVX512F-NEXT: kshiftlw $4, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_56
+; AVX512F-NEXT: ## BB#55: ## %cond.store53
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $11, %xmm0, 27(%rdi)
+; AVX512F-NEXT: LBB58_56: ## %else54
+; AVX512F-NEXT: kshiftlw $3, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_58
+; AVX512F-NEXT: ## BB#57: ## %cond.store55
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $12, %xmm0, 28(%rdi)
+; AVX512F-NEXT: LBB58_58: ## %else56
+; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm0
+; AVX512F-NEXT: kshiftlw $2, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_60
+; AVX512F-NEXT: ## BB#59: ## %cond.store57
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm1
+; AVX512F-NEXT: vpextrb $13, %xmm1, 29(%rdi)
+; AVX512F-NEXT: LBB58_60: ## %else58
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: kshiftlw $1, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_62
+; AVX512F-NEXT: ## BB#61: ## %cond.store59
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm1
+; AVX512F-NEXT: vpextrb $14, %xmm1, 30(%rdi)
+; AVX512F-NEXT: LBB58_62: ## %else60
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $0, %k1, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_64
+; AVX512F-NEXT: ## BB#63: ## %cond.store61
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX512F-NEXT: vpextrb $15, %xmm0, 31(%rdi)
+; AVX512F-NEXT: LBB58_64: ## %else62
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_66
+; AVX512F-NEXT: ## BB#65: ## %cond.store63
+; AVX512F-NEXT: vpextrb $0, %xmm5, 32(%rdi)
+; AVX512F-NEXT: LBB58_66: ## %else64
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_68
+; AVX512F-NEXT: ## BB#67: ## %cond.store65
+; AVX512F-NEXT: vpextrb $1, %xmm5, 33(%rdi)
+; AVX512F-NEXT: LBB58_68: ## %else66
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_70
+; AVX512F-NEXT: ## BB#69: ## %cond.store67
+; AVX512F-NEXT: vpextrb $2, %xmm5, 34(%rdi)
+; AVX512F-NEXT: LBB58_70: ## %else68
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_72
+; AVX512F-NEXT: ## BB#71: ## %cond.store69
+; AVX512F-NEXT: vpextrb $3, %xmm5, 35(%rdi)
+; AVX512F-NEXT: LBB58_72: ## %else70
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_74
+; AVX512F-NEXT: ## BB#73: ## %cond.store71
+; AVX512F-NEXT: vpextrb $4, %xmm5, 36(%rdi)
+; AVX512F-NEXT: LBB58_74: ## %else72
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_76
+; AVX512F-NEXT: ## BB#75: ## %cond.store73
+; AVX512F-NEXT: vpextrb $5, %xmm5, 37(%rdi)
+; AVX512F-NEXT: LBB58_76: ## %else74
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_78
+; AVX512F-NEXT: ## BB#77: ## %cond.store75
+; AVX512F-NEXT: vpextrb $6, %xmm5, 38(%rdi)
+; AVX512F-NEXT: LBB58_78: ## %else76
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_80
+; AVX512F-NEXT: ## BB#79: ## %cond.store77
+; AVX512F-NEXT: vpextrb $7, %xmm5, 39(%rdi)
+; AVX512F-NEXT: LBB58_80: ## %else78
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_82
+; AVX512F-NEXT: ## BB#81: ## %cond.store79
+; AVX512F-NEXT: vpextrb $8, %xmm5, 40(%rdi)
+; AVX512F-NEXT: LBB58_82: ## %else80
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_84
+; AVX512F-NEXT: ## BB#83: ## %cond.store81
+; AVX512F-NEXT: vpextrb $9, %xmm5, 41(%rdi)
+; AVX512F-NEXT: LBB58_84: ## %else82
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_86
+; AVX512F-NEXT: ## BB#85: ## %cond.store83
+; AVX512F-NEXT: vpextrb $10, %xmm5, 42(%rdi)
+; AVX512F-NEXT: LBB58_86: ## %else84
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_88
+; AVX512F-NEXT: ## BB#87: ## %cond.store85
+; AVX512F-NEXT: vpextrb $11, %xmm5, 43(%rdi)
+; AVX512F-NEXT: LBB58_88: ## %else86
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_90
+; AVX512F-NEXT: ## BB#89: ## %cond.store87
+; AVX512F-NEXT: vpextrb $12, %xmm5, 44(%rdi)
+; AVX512F-NEXT: LBB58_90: ## %else88
+; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm0
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_92
+; AVX512F-NEXT: ## BB#91: ## %cond.store89
+; AVX512F-NEXT: vpextrb $13, %xmm5, 45(%rdi)
+; AVX512F-NEXT: LBB58_92: ## %else90
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_94
+; AVX512F-NEXT: ## BB#93: ## %cond.store91
+; AVX512F-NEXT: vpextrb $14, %xmm5, 46(%rdi)
+; AVX512F-NEXT: LBB58_94: ## %else92
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: kshiftlw $0, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_96
+; AVX512F-NEXT: ## BB#95: ## %cond.store93
+; AVX512F-NEXT: vpextrb $15, %xmm5, 47(%rdi)
+; AVX512F-NEXT: LBB58_96: ## %else94
+; AVX512F-NEXT: kshiftlw $15, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_98
+; AVX512F-NEXT: ## BB#97: ## %cond.store95
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $0, %xmm0, 48(%rdi)
+; AVX512F-NEXT: LBB58_98: ## %else96
+; AVX512F-NEXT: kshiftlw $14, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_100
+; AVX512F-NEXT: ## BB#99: ## %cond.store97
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $1, %xmm0, 49(%rdi)
+; AVX512F-NEXT: LBB58_100: ## %else98
+; AVX512F-NEXT: kshiftlw $13, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_102
+; AVX512F-NEXT: ## BB#101: ## %cond.store99
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $2, %xmm0, 50(%rdi)
+; AVX512F-NEXT: LBB58_102: ## %else100
+; AVX512F-NEXT: kshiftlw $12, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_104
+; AVX512F-NEXT: ## BB#103: ## %cond.store101
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $3, %xmm0, 51(%rdi)
+; AVX512F-NEXT: LBB58_104: ## %else102
+; AVX512F-NEXT: kshiftlw $11, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_106
+; AVX512F-NEXT: ## BB#105: ## %cond.store103
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $4, %xmm0, 52(%rdi)
+; AVX512F-NEXT: LBB58_106: ## %else104
+; AVX512F-NEXT: kshiftlw $10, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_108
+; AVX512F-NEXT: ## BB#107: ## %cond.store105
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $5, %xmm0, 53(%rdi)
+; AVX512F-NEXT: LBB58_108: ## %else106
+; AVX512F-NEXT: kshiftlw $9, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_110
+; AVX512F-NEXT: ## BB#109: ## %cond.store107
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $6, %xmm0, 54(%rdi)
+; AVX512F-NEXT: LBB58_110: ## %else108
+; AVX512F-NEXT: kshiftlw $8, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_112
+; AVX512F-NEXT: ## BB#111: ## %cond.store109
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $7, %xmm0, 55(%rdi)
+; AVX512F-NEXT: LBB58_112: ## %else110
+; AVX512F-NEXT: kshiftlw $7, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_114
+; AVX512F-NEXT: ## BB#113: ## %cond.store111
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $8, %xmm0, 56(%rdi)
+; AVX512F-NEXT: LBB58_114: ## %else112
+; AVX512F-NEXT: kshiftlw $6, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_116
+; AVX512F-NEXT: ## BB#115: ## %cond.store113
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $9, %xmm0, 57(%rdi)
+; AVX512F-NEXT: LBB58_116: ## %else114
+; AVX512F-NEXT: kshiftlw $5, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_118
+; AVX512F-NEXT: ## BB#117: ## %cond.store115
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $10, %xmm0, 58(%rdi)
+; AVX512F-NEXT: LBB58_118: ## %else116
+; AVX512F-NEXT: kshiftlw $4, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_120
+; AVX512F-NEXT: ## BB#119: ## %cond.store117
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $11, %xmm0, 59(%rdi)
+; AVX512F-NEXT: LBB58_120: ## %else118
+; AVX512F-NEXT: kshiftlw $3, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_122
+; AVX512F-NEXT: ## BB#121: ## %cond.store119
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $12, %xmm0, 60(%rdi)
+; AVX512F-NEXT: LBB58_122: ## %else120
+; AVX512F-NEXT: kshiftlw $2, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_124
+; AVX512F-NEXT: ## BB#123: ## %cond.store121
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $13, %xmm0, 61(%rdi)
+; AVX512F-NEXT: LBB58_124: ## %else122
+; AVX512F-NEXT: kshiftlw $1, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_126
+; AVX512F-NEXT: ## BB#125: ## %cond.store123
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $14, %xmm0, 62(%rdi)
+; AVX512F-NEXT: LBB58_126: ## %else124
+; AVX512F-NEXT: kshiftlw $0, %k1, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB58_128
+; AVX512F-NEXT: ## BB#127: ## %cond.store125
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm0
+; AVX512F-NEXT: vpextrb $15, %xmm0, 63(%rdi)
+; AVX512F-NEXT: LBB58_128: ## %else126
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_store_64xi8:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %zmm0, %zmm0
+; SKX-NEXT: vpmovb2m %zmm0, %k1
+; SKX-NEXT: vmovdqu8 %zmm1, (%rdi) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> %val, <64 x i8>* %addr, i32 4, <64 x i1>%mask)
+ ret void
+}
+declare void @llvm.masked.store.v64i8.p0v64i8(<64 x i8>, <64 x i8>*, i32, <64 x i1>)
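+; Note on the checks above: the <64 x i1> mask arrives partly in GPRs and
+; partly on the stack, so the AVX1/AVX2 sequences keep reloading mask bytes
+; from the stack (movb ...(%rsp)) before each test-and-branch. AVX512F
+; rebuilds the mask 16 elements at a time (vpmovsxbd + vpslld $31 +
+; vptestmd) but still stores byte-by-byte, while SKX lowers the whole store
+; to vpmovb2m %zmm0 plus one masked vmovdqu8 %zmm1.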
+
+define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
+; AVX-LABEL: test_mask_store_8xi16:
+; AVX: ## BB#0:
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_2
+; AVX-NEXT: ## BB#1: ## %cond.store
+; AVX-NEXT: vmovd %xmm1, %eax
+; AVX-NEXT: movw %ax, (%rdi)
+; AVX-NEXT: LBB59_2: ## %else
+; AVX-NEXT: vpextrb $2, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_4
+; AVX-NEXT: ## BB#3: ## %cond.store1
+; AVX-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX-NEXT: LBB59_4: ## %else2
+; AVX-NEXT: vpextrb $4, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_6
+; AVX-NEXT: ## BB#5: ## %cond.store3
+; AVX-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX-NEXT: LBB59_6: ## %else4
+; AVX-NEXT: vpextrb $6, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_8
+; AVX-NEXT: ## BB#7: ## %cond.store5
+; AVX-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX-NEXT: LBB59_8: ## %else6
+; AVX-NEXT: vpextrb $8, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_10
+; AVX-NEXT: ## BB#9: ## %cond.store7
+; AVX-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX-NEXT: LBB59_10: ## %else8
+; AVX-NEXT: vpextrb $10, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_12
+; AVX-NEXT: ## BB#11: ## %cond.store9
+; AVX-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX-NEXT: LBB59_12: ## %else10
+; AVX-NEXT: vpextrb $12, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_14
+; AVX-NEXT: ## BB#13: ## %cond.store11
+; AVX-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX-NEXT: LBB59_14: ## %else12
+; AVX-NEXT: vpextrb $14, %xmm0, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: je LBB59_16
+; AVX-NEXT: ## BB#15: ## %cond.store13
+; AVX-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX-NEXT: LBB59_16: ## %else14
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_store_8xi16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_2
+; AVX512F-NEXT: ## BB#1: ## %cond.store
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: movw %ax, (%rdi)
+; AVX512F-NEXT: LBB59_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_4
+; AVX512F-NEXT: ## BB#3: ## %cond.store1
+; AVX512F-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX512F-NEXT: LBB59_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_6
+; AVX512F-NEXT: ## BB#5: ## %cond.store3
+; AVX512F-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX512F-NEXT: LBB59_6: ## %else4
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_8
+; AVX512F-NEXT: ## BB#7: ## %cond.store5
+; AVX512F-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX512F-NEXT: LBB59_8: ## %else6
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_10
+; AVX512F-NEXT: ## BB#9: ## %cond.store7
+; AVX512F-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX512F-NEXT: LBB59_10: ## %else8
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_12
+; AVX512F-NEXT: ## BB#11: ## %cond.store9
+; AVX512F-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX512F-NEXT: LBB59_12: ## %else10
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_14
+; AVX512F-NEXT: ## BB#13: ## %cond.store11
+; AVX512F-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX512F-NEXT: LBB59_14: ## %else12
+; AVX512F-NEXT: kshiftlw $8, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB59_16
+; AVX512F-NEXT: ## BB#15: ## %cond.store13
+; AVX512F-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX512F-NEXT: LBB59_16: ## %else14
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_store_8xi16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT: vpmovw2m %xmm0, %k1
+; SKX-NEXT: vmovdqu16 %xmm1, (%rdi) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %val, <8 x i16>* %addr, i32 4, <8 x i1>%mask)
+ ret void
+}
+declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
+
+define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
+; AVX1-LABEL: test_mask_store_16xi16:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_2
+; AVX1-NEXT: ## BB#1: ## %cond.store
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: movw %ax, (%rdi)
+; AVX1-NEXT: LBB60_2: ## %else
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_4
+; AVX1-NEXT: ## BB#3: ## %cond.store1
+; AVX1-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX1-NEXT: LBB60_4: ## %else2
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_6
+; AVX1-NEXT: ## BB#5: ## %cond.store3
+; AVX1-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX1-NEXT: LBB60_6: ## %else4
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_8
+; AVX1-NEXT: ## BB#7: ## %cond.store5
+; AVX1-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX1-NEXT: LBB60_8: ## %else6
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_10
+; AVX1-NEXT: ## BB#9: ## %cond.store7
+; AVX1-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX1-NEXT: LBB60_10: ## %else8
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_12
+; AVX1-NEXT: ## BB#11: ## %cond.store9
+; AVX1-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX1-NEXT: LBB60_12: ## %else10
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_14
+; AVX1-NEXT: ## BB#13: ## %cond.store11
+; AVX1-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX1-NEXT: LBB60_14: ## %else12
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_16
+; AVX1-NEXT: ## BB#15: ## %cond.store13
+; AVX1-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX1-NEXT: LBB60_16: ## %else14
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_18
+; AVX1-NEXT: ## BB#17: ## %cond.store15
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: movw %ax, 16(%rdi)
+; AVX1-NEXT: LBB60_18: ## %else16
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_20
+; AVX1-NEXT: ## BB#19: ## %cond.store17
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrw $1, %xmm2, 18(%rdi)
+; AVX1-NEXT: LBB60_20: ## %else18
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_22
+; AVX1-NEXT: ## BB#21: ## %cond.store19
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrw $2, %xmm2, 20(%rdi)
+; AVX1-NEXT: LBB60_22: ## %else20
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_24
+; AVX1-NEXT: ## BB#23: ## %cond.store21
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrw $3, %xmm2, 22(%rdi)
+; AVX1-NEXT: LBB60_24: ## %else22
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_26
+; AVX1-NEXT: ## BB#25: ## %cond.store23
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrw $4, %xmm2, 24(%rdi)
+; AVX1-NEXT: LBB60_26: ## %else24
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_28
+; AVX1-NEXT: ## BB#27: ## %cond.store25
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrw $5, %xmm2, 26(%rdi)
+; AVX1-NEXT: LBB60_28: ## %else26
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_30
+; AVX1-NEXT: ## BB#29: ## %cond.store27
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrw $6, %xmm2, 28(%rdi)
+; AVX1-NEXT: LBB60_30: ## %else28
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB60_32
+; AVX1-NEXT: ## BB#31: ## %cond.store29
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrw $7, %xmm0, 30(%rdi)
+; AVX1-NEXT: LBB60_32: ## %else30
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_store_16xi16:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_2
+; AVX2-NEXT: ## BB#1: ## %cond.store
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: movw %ax, (%rdi)
+; AVX2-NEXT: LBB60_2: ## %else
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_4
+; AVX2-NEXT: ## BB#3: ## %cond.store1
+; AVX2-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX2-NEXT: LBB60_4: ## %else2
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_6
+; AVX2-NEXT: ## BB#5: ## %cond.store3
+; AVX2-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX2-NEXT: LBB60_6: ## %else4
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_8
+; AVX2-NEXT: ## BB#7: ## %cond.store5
+; AVX2-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX2-NEXT: LBB60_8: ## %else6
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_10
+; AVX2-NEXT: ## BB#9: ## %cond.store7
+; AVX2-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX2-NEXT: LBB60_10: ## %else8
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_12
+; AVX2-NEXT: ## BB#11: ## %cond.store9
+; AVX2-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX2-NEXT: LBB60_12: ## %else10
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_14
+; AVX2-NEXT: ## BB#13: ## %cond.store11
+; AVX2-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX2-NEXT: LBB60_14: ## %else12
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_16
+; AVX2-NEXT: ## BB#15: ## %cond.store13
+; AVX2-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX2-NEXT: LBB60_16: ## %else14
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_18
+; AVX2-NEXT: ## BB#17: ## %cond.store15
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: movw %ax, 16(%rdi)
+; AVX2-NEXT: LBB60_18: ## %else16
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_20
+; AVX2-NEXT: ## BB#19: ## %cond.store17
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrw $1, %xmm2, 18(%rdi)
+; AVX2-NEXT: LBB60_20: ## %else18
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_22
+; AVX2-NEXT: ## BB#21: ## %cond.store19
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrw $2, %xmm2, 20(%rdi)
+; AVX2-NEXT: LBB60_22: ## %else20
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_24
+; AVX2-NEXT: ## BB#23: ## %cond.store21
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrw $3, %xmm2, 22(%rdi)
+; AVX2-NEXT: LBB60_24: ## %else22
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_26
+; AVX2-NEXT: ## BB#25: ## %cond.store23
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrw $4, %xmm2, 24(%rdi)
+; AVX2-NEXT: LBB60_26: ## %else24
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_28
+; AVX2-NEXT: ## BB#27: ## %cond.store25
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrw $5, %xmm2, 26(%rdi)
+; AVX2-NEXT: LBB60_28: ## %else26
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_30
+; AVX2-NEXT: ## BB#29: ## %cond.store27
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrw $6, %xmm2, 28(%rdi)
+; AVX2-NEXT: LBB60_30: ## %else28
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB60_32
+; AVX2-NEXT: ## BB#31: ## %cond.store29
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrw $7, %xmm0, 30(%rdi)
+; AVX2-NEXT: LBB60_32: ## %else30
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_store_16xi16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_2
+; AVX512F-NEXT: ## BB#1: ## %cond.store
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: movw %ax, (%rdi)
+; AVX512F-NEXT: LBB60_2: ## %else
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_4
+; AVX512F-NEXT: ## BB#3: ## %cond.store1
+; AVX512F-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX512F-NEXT: LBB60_4: ## %else2
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_6
+; AVX512F-NEXT: ## BB#5: ## %cond.store3
+; AVX512F-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX512F-NEXT: LBB60_6: ## %else4
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_8
+; AVX512F-NEXT: ## BB#7: ## %cond.store5
+; AVX512F-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX512F-NEXT: LBB60_8: ## %else6
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_10
+; AVX512F-NEXT: ## BB#9: ## %cond.store7
+; AVX512F-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX512F-NEXT: LBB60_10: ## %else8
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_12
+; AVX512F-NEXT: ## BB#11: ## %cond.store9
+; AVX512F-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX512F-NEXT: LBB60_12: ## %else10
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_14
+; AVX512F-NEXT: ## BB#13: ## %cond.store11
+; AVX512F-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX512F-NEXT: LBB60_14: ## %else12
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_16
+; AVX512F-NEXT: ## BB#15: ## %cond.store13
+; AVX512F-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX512F-NEXT: LBB60_16: ## %else14
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_18
+; AVX512F-NEXT: ## BB#17: ## %cond.store15
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: movw %ax, 16(%rdi)
+; AVX512F-NEXT: LBB60_18: ## %else16
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_20
+; AVX512F-NEXT: ## BB#19: ## %cond.store17
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $1, %xmm0, 18(%rdi)
+; AVX512F-NEXT: LBB60_20: ## %else18
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_22
+; AVX512F-NEXT: ## BB#21: ## %cond.store19
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $2, %xmm0, 20(%rdi)
+; AVX512F-NEXT: LBB60_22: ## %else20
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_24
+; AVX512F-NEXT: ## BB#23: ## %cond.store21
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $3, %xmm0, 22(%rdi)
+; AVX512F-NEXT: LBB60_24: ## %else22
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_26
+; AVX512F-NEXT: ## BB#25: ## %cond.store23
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $4, %xmm0, 24(%rdi)
+; AVX512F-NEXT: LBB60_26: ## %else24
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_28
+; AVX512F-NEXT: ## BB#27: ## %cond.store25
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $5, %xmm0, 26(%rdi)
+; AVX512F-NEXT: LBB60_28: ## %else26
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_30
+; AVX512F-NEXT: ## BB#29: ## %cond.store27
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $6, %xmm0, 28(%rdi)
+; AVX512F-NEXT: LBB60_30: ## %else28
+; AVX512F-NEXT: kshiftlw $0, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: testb %al, %al
+; AVX512F-NEXT: je LBB60_32
+; AVX512F-NEXT: ## BB#31: ## %cond.store29
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: vpextrw $7, %xmm0, 30(%rdi)
+; AVX512F-NEXT: LBB60_32: ## %else30
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_store_16xi16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT: vpmovb2m %xmm0, %k1
+; SKX-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %val, <16 x i16>* %addr, i32 4, <16 x i1>%mask)
+ ret void
+}
+declare void @llvm.masked.store.v16i16.p0v16i16(<16 x i16>, <16 x i16>*, i32, <16 x i1>)
+
+define void @test_mask_store_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i16> %val) {
+; AVX1-LABEL: test_mask_store_32xi16:
+; AVX1: ## BB#0:
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_2
+; AVX1-NEXT: ## BB#1: ## %cond.store
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: movw %ax, (%rdi)
+; AVX1-NEXT: LBB61_2: ## %else
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_4
+; AVX1-NEXT: ## BB#3: ## %cond.store1
+; AVX1-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX1-NEXT: LBB61_4: ## %else2
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_6
+; AVX1-NEXT: ## BB#5: ## %cond.store3
+; AVX1-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX1-NEXT: LBB61_6: ## %else4
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_8
+; AVX1-NEXT: ## BB#7: ## %cond.store5
+; AVX1-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX1-NEXT: LBB61_8: ## %else6
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_10
+; AVX1-NEXT: ## BB#9: ## %cond.store7
+; AVX1-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX1-NEXT: LBB61_10: ## %else8
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_12
+; AVX1-NEXT: ## BB#11: ## %cond.store9
+; AVX1-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX1-NEXT: LBB61_12: ## %else10
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_14
+; AVX1-NEXT: ## BB#13: ## %cond.store11
+; AVX1-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX1-NEXT: LBB61_14: ## %else12
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_16
+; AVX1-NEXT: ## BB#15: ## %cond.store13
+; AVX1-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX1-NEXT: LBB61_16: ## %else14
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_18
+; AVX1-NEXT: ## BB#17: ## %cond.store15
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vmovd %xmm3, %eax
+; AVX1-NEXT: movw %ax, 16(%rdi)
+; AVX1-NEXT: LBB61_18: ## %else16
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_20
+; AVX1-NEXT: ## BB#19: ## %cond.store17
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrw $1, %xmm3, 18(%rdi)
+; AVX1-NEXT: LBB61_20: ## %else18
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_22
+; AVX1-NEXT: ## BB#21: ## %cond.store19
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrw $2, %xmm3, 20(%rdi)
+; AVX1-NEXT: LBB61_22: ## %else20
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_24
+; AVX1-NEXT: ## BB#23: ## %cond.store21
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrw $3, %xmm3, 22(%rdi)
+; AVX1-NEXT: LBB61_24: ## %else22
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_26
+; AVX1-NEXT: ## BB#25: ## %cond.store23
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrw $4, %xmm3, 24(%rdi)
+; AVX1-NEXT: LBB61_26: ## %else24
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_28
+; AVX1-NEXT: ## BB#27: ## %cond.store25
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrw $5, %xmm3, 26(%rdi)
+; AVX1-NEXT: LBB61_28: ## %else26
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_30
+; AVX1-NEXT: ## BB#29: ## %cond.store27
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpextrw $6, %xmm3, 28(%rdi)
+; AVX1-NEXT: LBB61_30: ## %else28
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_32
+; AVX1-NEXT: ## BB#31: ## %cond.store29
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpextrw $7, %xmm1, 30(%rdi)
+; AVX1-NEXT: LBB61_32: ## %else30
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_34
+; AVX1-NEXT: ## BB#33: ## %cond.store31
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: movw %ax, 32(%rdi)
+; AVX1-NEXT: LBB61_34: ## %else32
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_36
+; AVX1-NEXT: ## BB#35: ## %cond.store33
+; AVX1-NEXT: vpextrw $1, %xmm2, 34(%rdi)
+; AVX1-NEXT: LBB61_36: ## %else34
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_38
+; AVX1-NEXT: ## BB#37: ## %cond.store35
+; AVX1-NEXT: vpextrw $2, %xmm2, 36(%rdi)
+; AVX1-NEXT: LBB61_38: ## %else36
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_40
+; AVX1-NEXT: ## BB#39: ## %cond.store37
+; AVX1-NEXT: vpextrw $3, %xmm2, 38(%rdi)
+; AVX1-NEXT: LBB61_40: ## %else38
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_42
+; AVX1-NEXT: ## BB#41: ## %cond.store39
+; AVX1-NEXT: vpextrw $4, %xmm2, 40(%rdi)
+; AVX1-NEXT: LBB61_42: ## %else40
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_44
+; AVX1-NEXT: ## BB#43: ## %cond.store41
+; AVX1-NEXT: vpextrw $5, %xmm2, 42(%rdi)
+; AVX1-NEXT: LBB61_44: ## %else42
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_46
+; AVX1-NEXT: ## BB#45: ## %cond.store43
+; AVX1-NEXT: vpextrw $6, %xmm2, 44(%rdi)
+; AVX1-NEXT: LBB61_46: ## %else44
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_48
+; AVX1-NEXT: ## BB#47: ## %cond.store45
+; AVX1-NEXT: vpextrw $7, %xmm2, 46(%rdi)
+; AVX1-NEXT: LBB61_48: ## %else46
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_50
+; AVX1-NEXT: ## BB#49: ## %cond.store47
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: movw %ax, 48(%rdi)
+; AVX1-NEXT: LBB61_50: ## %else48
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_52
+; AVX1-NEXT: ## BB#51: ## %cond.store49
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpextrw $1, %xmm1, 50(%rdi)
+; AVX1-NEXT: LBB61_52: ## %else50
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_54
+; AVX1-NEXT: ## BB#53: ## %cond.store51
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpextrw $2, %xmm1, 52(%rdi)
+; AVX1-NEXT: LBB61_54: ## %else52
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_56
+; AVX1-NEXT: ## BB#55: ## %cond.store53
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpextrw $3, %xmm1, 54(%rdi)
+; AVX1-NEXT: LBB61_56: ## %else54
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_58
+; AVX1-NEXT: ## BB#57: ## %cond.store55
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpextrw $4, %xmm1, 56(%rdi)
+; AVX1-NEXT: LBB61_58: ## %else56
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_60
+; AVX1-NEXT: ## BB#59: ## %cond.store57
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpextrw $5, %xmm1, 58(%rdi)
+; AVX1-NEXT: LBB61_60: ## %else58
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_62
+; AVX1-NEXT: ## BB#61: ## %cond.store59
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpextrw $6, %xmm1, 60(%rdi)
+; AVX1-NEXT: LBB61_62: ## %else60
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB61_64
+; AVX1-NEXT: ## BB#63: ## %cond.store61
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT: vpextrw $7, %xmm0, 62(%rdi)
+; AVX1-NEXT: LBB61_64: ## %else62
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mask_store_32xi16:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_2
+; AVX2-NEXT: ## BB#1: ## %cond.store
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: movw %ax, (%rdi)
+; AVX2-NEXT: LBB61_2: ## %else
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_4
+; AVX2-NEXT: ## BB#3: ## %cond.store1
+; AVX2-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX2-NEXT: LBB61_4: ## %else2
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_6
+; AVX2-NEXT: ## BB#5: ## %cond.store3
+; AVX2-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX2-NEXT: LBB61_6: ## %else4
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_8
+; AVX2-NEXT: ## BB#7: ## %cond.store5
+; AVX2-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX2-NEXT: LBB61_8: ## %else6
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_10
+; AVX2-NEXT: ## BB#9: ## %cond.store7
+; AVX2-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX2-NEXT: LBB61_10: ## %else8
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_12
+; AVX2-NEXT: ## BB#11: ## %cond.store9
+; AVX2-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX2-NEXT: LBB61_12: ## %else10
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_14
+; AVX2-NEXT: ## BB#13: ## %cond.store11
+; AVX2-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX2-NEXT: LBB61_14: ## %else12
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_16
+; AVX2-NEXT: ## BB#15: ## %cond.store13
+; AVX2-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX2-NEXT: LBB61_16: ## %else14
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_18
+; AVX2-NEXT: ## BB#17: ## %cond.store15
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vmovd %xmm3, %eax
+; AVX2-NEXT: movw %ax, 16(%rdi)
+; AVX2-NEXT: LBB61_18: ## %else16
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_20
+; AVX2-NEXT: ## BB#19: ## %cond.store17
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrw $1, %xmm3, 18(%rdi)
+; AVX2-NEXT: LBB61_20: ## %else18
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_22
+; AVX2-NEXT: ## BB#21: ## %cond.store19
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrw $2, %xmm3, 20(%rdi)
+; AVX2-NEXT: LBB61_22: ## %else20
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_24
+; AVX2-NEXT: ## BB#23: ## %cond.store21
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrw $3, %xmm3, 22(%rdi)
+; AVX2-NEXT: LBB61_24: ## %else22
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_26
+; AVX2-NEXT: ## BB#25: ## %cond.store23
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrw $4, %xmm3, 24(%rdi)
+; AVX2-NEXT: LBB61_26: ## %else24
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_28
+; AVX2-NEXT: ## BB#27: ## %cond.store25
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrw $5, %xmm3, 26(%rdi)
+; AVX2-NEXT: LBB61_28: ## %else26
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_30
+; AVX2-NEXT: ## BB#29: ## %cond.store27
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpextrw $6, %xmm3, 28(%rdi)
+; AVX2-NEXT: LBB61_30: ## %else28
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_32
+; AVX2-NEXT: ## BB#31: ## %cond.store29
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vpextrw $7, %xmm1, 30(%rdi)
+; AVX2-NEXT: LBB61_32: ## %else30
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_34
+; AVX2-NEXT: ## BB#33: ## %cond.store31
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: movw %ax, 32(%rdi)
+; AVX2-NEXT: LBB61_34: ## %else32
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_36
+; AVX2-NEXT: ## BB#35: ## %cond.store33
+; AVX2-NEXT: vpextrw $1, %xmm2, 34(%rdi)
+; AVX2-NEXT: LBB61_36: ## %else34
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_38
+; AVX2-NEXT: ## BB#37: ## %cond.store35
+; AVX2-NEXT: vpextrw $2, %xmm2, 36(%rdi)
+; AVX2-NEXT: LBB61_38: ## %else36
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_40
+; AVX2-NEXT: ## BB#39: ## %cond.store37
+; AVX2-NEXT: vpextrw $3, %xmm2, 38(%rdi)
+; AVX2-NEXT: LBB61_40: ## %else38
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_42
+; AVX2-NEXT: ## BB#41: ## %cond.store39
+; AVX2-NEXT: vpextrw $4, %xmm2, 40(%rdi)
+; AVX2-NEXT: LBB61_42: ## %else40
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_44
+; AVX2-NEXT: ## BB#43: ## %cond.store41
+; AVX2-NEXT: vpextrw $5, %xmm2, 42(%rdi)
+; AVX2-NEXT: LBB61_44: ## %else42
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_46
+; AVX2-NEXT: ## BB#45: ## %cond.store43
+; AVX2-NEXT: vpextrw $6, %xmm2, 44(%rdi)
+; AVX2-NEXT: LBB61_46: ## %else44
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_48
+; AVX2-NEXT: ## BB#47: ## %cond.store45
+; AVX2-NEXT: vpextrw $7, %xmm2, 46(%rdi)
+; AVX2-NEXT: LBB61_48: ## %else46
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_50
+; AVX2-NEXT: ## BB#49: ## %cond.store47
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: movw %ax, 48(%rdi)
+; AVX2-NEXT: LBB61_50: ## %else48
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_52
+; AVX2-NEXT: ## BB#51: ## %cond.store49
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vpextrw $1, %xmm1, 50(%rdi)
+; AVX2-NEXT: LBB61_52: ## %else50
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_54
+; AVX2-NEXT: ## BB#53: ## %cond.store51
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vpextrw $2, %xmm1, 52(%rdi)
+; AVX2-NEXT: LBB61_54: ## %else52
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_56
+; AVX2-NEXT: ## BB#55: ## %cond.store53
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vpextrw $3, %xmm1, 54(%rdi)
+; AVX2-NEXT: LBB61_56: ## %else54
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_58
+; AVX2-NEXT: ## BB#57: ## %cond.store55
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vpextrw $4, %xmm1, 56(%rdi)
+; AVX2-NEXT: LBB61_58: ## %else56
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_60
+; AVX2-NEXT: ## BB#59: ## %cond.store57
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vpextrw $5, %xmm1, 58(%rdi)
+; AVX2-NEXT: LBB61_60: ## %else58
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_62
+; AVX2-NEXT: ## BB#61: ## %cond.store59
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vpextrw $6, %xmm1, 60(%rdi)
+; AVX2-NEXT: LBB61_62: ## %else60
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: je LBB61_64
+; AVX2-NEXT: ## BB#63: ## %cond.store61
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX2-NEXT: vpextrw $7, %xmm0, 62(%rdi)
+; AVX2-NEXT: LBB61_64: ## %else62
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_mask_store_32xi16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_2
+; AVX512F-NEXT: ## BB#1: ## %cond.store
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: movw %ax, (%rdi)
+; AVX512F-NEXT: LBB61_2: ## %else
+; AVX512F-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_4
+; AVX512F-NEXT: ## BB#3: ## %cond.store1
+; AVX512F-NEXT: vpextrw $1, %xmm1, 2(%rdi)
+; AVX512F-NEXT: LBB61_4: ## %else2
+; AVX512F-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_6
+; AVX512F-NEXT: ## BB#5: ## %cond.store3
+; AVX512F-NEXT: vpextrw $2, %xmm1, 4(%rdi)
+; AVX512F-NEXT: LBB61_6: ## %else4
+; AVX512F-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_8
+; AVX512F-NEXT: ## BB#7: ## %cond.store5
+; AVX512F-NEXT: vpextrw $3, %xmm1, 6(%rdi)
+; AVX512F-NEXT: LBB61_8: ## %else6
+; AVX512F-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_10
+; AVX512F-NEXT: ## BB#9: ## %cond.store7
+; AVX512F-NEXT: vpextrw $4, %xmm1, 8(%rdi)
+; AVX512F-NEXT: LBB61_10: ## %else8
+; AVX512F-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_12
+; AVX512F-NEXT: ## BB#11: ## %cond.store9
+; AVX512F-NEXT: vpextrw $5, %xmm1, 10(%rdi)
+; AVX512F-NEXT: LBB61_12: ## %else10
+; AVX512F-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_14
+; AVX512F-NEXT: ## BB#13: ## %cond.store11
+; AVX512F-NEXT: vpextrw $6, %xmm1, 12(%rdi)
+; AVX512F-NEXT: LBB61_14: ## %else12
+; AVX512F-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_16
+; AVX512F-NEXT: ## BB#15: ## %cond.store13
+; AVX512F-NEXT: vpextrw $7, %xmm1, 14(%rdi)
+; AVX512F-NEXT: LBB61_16: ## %else14
+; AVX512F-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_18
+; AVX512F-NEXT: ## BB#17: ## %cond.store15
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vmovd %xmm3, %eax
+; AVX512F-NEXT: movw %ax, 16(%rdi)
+; AVX512F-NEXT: LBB61_18: ## %else16
+; AVX512F-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_20
+; AVX512F-NEXT: ## BB#19: ## %cond.store17
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpextrw $1, %xmm3, 18(%rdi)
+; AVX512F-NEXT: LBB61_20: ## %else18
+; AVX512F-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_22
+; AVX512F-NEXT: ## BB#21: ## %cond.store19
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpextrw $2, %xmm3, 20(%rdi)
+; AVX512F-NEXT: LBB61_22: ## %else20
+; AVX512F-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_24
+; AVX512F-NEXT: ## BB#23: ## %cond.store21
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpextrw $3, %xmm3, 22(%rdi)
+; AVX512F-NEXT: LBB61_24: ## %else22
+; AVX512F-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_26
+; AVX512F-NEXT: ## BB#25: ## %cond.store23
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpextrw $4, %xmm3, 24(%rdi)
+; AVX512F-NEXT: LBB61_26: ## %else24
+; AVX512F-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_28
+; AVX512F-NEXT: ## BB#27: ## %cond.store25
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpextrw $5, %xmm3, 26(%rdi)
+; AVX512F-NEXT: LBB61_28: ## %else26
+; AVX512F-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_30
+; AVX512F-NEXT: ## BB#29: ## %cond.store27
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpextrw $6, %xmm3, 28(%rdi)
+; AVX512F-NEXT: LBB61_30: ## %else28
+; AVX512F-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_32
+; AVX512F-NEXT: ## BB#31: ## %cond.store29
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT: vpextrw $7, %xmm1, 30(%rdi)
+; AVX512F-NEXT: LBB61_32: ## %else30
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_34
+; AVX512F-NEXT: ## BB#33: ## %cond.store31
+; AVX512F-NEXT: vmovd %xmm2, %eax
+; AVX512F-NEXT: movw %ax, 32(%rdi)
+; AVX512F-NEXT: LBB61_34: ## %else32
+; AVX512F-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_36
+; AVX512F-NEXT: ## BB#35: ## %cond.store33
+; AVX512F-NEXT: vpextrw $1, %xmm2, 34(%rdi)
+; AVX512F-NEXT: LBB61_36: ## %else34
+; AVX512F-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_38
+; AVX512F-NEXT: ## BB#37: ## %cond.store35
+; AVX512F-NEXT: vpextrw $2, %xmm2, 36(%rdi)
+; AVX512F-NEXT: LBB61_38: ## %else36
+; AVX512F-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_40
+; AVX512F-NEXT: ## BB#39: ## %cond.store37
+; AVX512F-NEXT: vpextrw $3, %xmm2, 38(%rdi)
+; AVX512F-NEXT: LBB61_40: ## %else38
+; AVX512F-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_42
+; AVX512F-NEXT: ## BB#41: ## %cond.store39
+; AVX512F-NEXT: vpextrw $4, %xmm2, 40(%rdi)
+; AVX512F-NEXT: LBB61_42: ## %else40
+; AVX512F-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_44
+; AVX512F-NEXT: ## BB#43: ## %cond.store41
+; AVX512F-NEXT: vpextrw $5, %xmm2, 42(%rdi)
+; AVX512F-NEXT: LBB61_44: ## %else42
+; AVX512F-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_46
+; AVX512F-NEXT: ## BB#45: ## %cond.store43
+; AVX512F-NEXT: vpextrw $6, %xmm2, 44(%rdi)
+; AVX512F-NEXT: LBB61_46: ## %else44
+; AVX512F-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_48
+; AVX512F-NEXT: ## BB#47: ## %cond.store45
+; AVX512F-NEXT: vpextrw $7, %xmm2, 46(%rdi)
+; AVX512F-NEXT: LBB61_48: ## %else46
+; AVX512F-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_50
+; AVX512F-NEXT: ## BB#49: ## %cond.store47
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: movw %ax, 48(%rdi)
+; AVX512F-NEXT: LBB61_50: ## %else48
+; AVX512F-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_52
+; AVX512F-NEXT: ## BB#51: ## %cond.store49
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vpextrw $1, %xmm1, 50(%rdi)
+; AVX512F-NEXT: LBB61_52: ## %else50
+; AVX512F-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_54
+; AVX512F-NEXT: ## BB#53: ## %cond.store51
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vpextrw $2, %xmm1, 52(%rdi)
+; AVX512F-NEXT: LBB61_54: ## %else52
+; AVX512F-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_56
+; AVX512F-NEXT: ## BB#55: ## %cond.store53
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vpextrw $3, %xmm1, 54(%rdi)
+; AVX512F-NEXT: LBB61_56: ## %else54
+; AVX512F-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_58
+; AVX512F-NEXT: ## BB#57: ## %cond.store55
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vpextrw $4, %xmm1, 56(%rdi)
+; AVX512F-NEXT: LBB61_58: ## %else56
+; AVX512F-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_60
+; AVX512F-NEXT: ## BB#59: ## %cond.store57
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vpextrw $5, %xmm1, 58(%rdi)
+; AVX512F-NEXT: LBB61_60: ## %else58
+; AVX512F-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_62
+; AVX512F-NEXT: ## BB#61: ## %cond.store59
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512F-NEXT: vpextrw $6, %xmm1, 60(%rdi)
+; AVX512F-NEXT: LBB61_62: ## %else60
+; AVX512F-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512F-NEXT: testb $1, %al
+; AVX512F-NEXT: je LBB61_64
+; AVX512F-NEXT: ## BB#63: ## %cond.store61
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512F-NEXT: vpextrw $7, %xmm0, 62(%rdi)
+; AVX512F-NEXT: LBB61_64: ## %else62
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: test_mask_store_32xi16:
+; SKX: ## BB#0:
+; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
+; SKX-NEXT: vpmovb2m %ymm0, %k1
+; SKX-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
+; SKX-NEXT: retq
+ call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> %val, <32 x i16>* %addr, i32 4, <32 x i1>%mask)
+ ret void
+}
+
+declare void @llvm.masked.store.v32i16.p0v32i16(<32 x i16>, <32 x i16>*, i32, <32 x i1>)
diff --git a/test/CodeGen/X86/materialize-one.ll b/test/CodeGen/X86/materialize-one.ll
deleted file mode 100644
index 49da8008b88c..000000000000
--- a/test/CodeGen/X86/materialize-one.ll
+++ /dev/null
@@ -1,100 +0,0 @@
-; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64
-
-define i32 @one32() optsize {
-entry:
- ret i32 1
-
-; CHECK32-LABEL: one32
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: incl %eax
-; CHECK32-NEXT: ret
-
-; FIXME: Figure out the best approach in 64-bit mode.
-; CHECK64-LABEL: one32
-; CHECK64: movl $1, %eax
-; CHECK64-NEXT: retq
-}
-
-define i32 @minus_one32() optsize {
-entry:
- ret i32 -1
-
-; CHECK32-LABEL: minus_one32
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: decl %eax
-; CHECK32-NEXT: ret
-}
-
-define i16 @one16() optsize {
-entry:
- ret i16 1
-
-; CHECK32-LABEL: one16
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: incl %eax
-; CHECK32-NEXT: retl
-}
-
-define i16 @minus_one16() optsize {
-entry:
- ret i16 -1
-
-; CHECK32-LABEL: minus_one16
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: decl %eax
-; CHECK32-NEXT: retl
-}
-
-define i32 @test_rematerialization() optsize {
-entry:
- ; Materialize -1 (thiscall forces it into %ecx).
- tail call x86_thiscallcc void @f(i32 -1)
-
- ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
- ; spilling it to the stack.
- tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
-
- ; -1 should be re-materialized here instead of getting spilled above.
- ret i32 -1
-
-; CHECK32-LABEL: test_rematerialization
-; CHECK32: xorl %ecx, %ecx
-; CHECK32-NEXT: decl %ecx
-; CHECK32: calll
-; CHECK32: xorl %eax, %eax
-; CHECK32-NEXT: decl %eax
-; CHECK32-NOT: %eax
-; CHECK32: retl
-}
-
-define i32 @test_rematerialization2(i32 %x) optsize {
-entry:
- ; Materialize -1 (thiscall forces it into %ecx).
- tail call x86_thiscallcc void @f(i32 -1)
-
- ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
- ; spilling it to the stack.
- tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
-
- ; Define eflags.
- %a = icmp ne i32 %x, 123
- %b = zext i1 %a to i32
- ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags.
- ; It must therefore not use the xor-dec lowering.
- %c = select i1 %a, i32 %b, i32 -1
- ret i32 %c
-
-; CHECK32-LABEL: test_rematerialization2
-; CHECK32: xorl %ecx, %ecx
-; CHECK32-NEXT: decl %ecx
-; CHECK32: calll
-; CHECK32: cmpl
-; CHECK32: setne
-; CHECK32-NOT: xorl
-; CHECK32: movl $-1
-; CHECK32: cmov
-; CHECK32: retl
-}
-
-declare x86_thiscallcc void @f(i32)
diff --git a/test/CodeGen/X86/materialize.ll b/test/CodeGen/X86/materialize.ll
new file mode 100644
index 000000000000..6e1264b4fd43
--- /dev/null
+++ b/test/CodeGen/X86/materialize.ll
@@ -0,0 +1,216 @@
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64
+; RUN: llc -mtriple=x86_64-pc-win32 -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECKWIN64
+
+define i32 @one32_nooptsize() {
+entry:
+ ret i32 1
+
+; When not optimizing for size, use mov.
+; CHECK32-LABEL: one32_nooptsize:
+; CHECK32: movl $1, %eax
+; CHECK32-NEXT: retl
+; CHECK64-LABEL: one32_nooptsize:
+; CHECK64: movl $1, %eax
+; CHECK64-NEXT: retq
+}
+
+define i32 @one32() optsize {
+entry:
+ ret i32 1
+
+; CHECK32-LABEL: one32:
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: incl %eax
+; CHECK32-NEXT: retl
+
+; FIXME: Figure out the best approach in 64-bit mode.
+; CHECK64-LABEL: one32:
+; CHECK64: movl $1, %eax
+; CHECK64-NEXT: retq
+}
+
+define i32 @one32_minsize() minsize {
+entry:
+ ret i32 1
+
+; On 32-bit, xor-inc is preferred over push-pop.
+; CHECK32-LABEL: one32_minsize:
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: incl %eax
+; CHECK32-NEXT: retl
+
+; On 64-bit we don't do xor-inc yet, so push-pop it is. Note that we have to
+; pop into a 64-bit register even when we just need 32 bits.
+; CHECK64-LABEL: one32_minsize:
+; CHECK64: pushq $1
+; CHECK64: .cfi_adjust_cfa_offset 8
+; CHECK64: popq %rax
+; CHECK64: .cfi_adjust_cfa_offset -8
+; CHECK64-NEXT: retq
+
+; On Win64 we can't adjust the stack unless there's a frame pointer.
+; CHECKWIN64-LABEL: one32_minsize:
+; CHECKWIN64: movl $1, %eax
+; CHECKWIN64-NEXT: retq
+}
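+; (Size note, a rough sketch of the tradeoff exercised above: on ia32,
+; xorl+incl and pushl $imm8+popl each encode in 3 bytes, versus 5 bytes for
+; movl $imm32, which is presumably why those forms are chosen at minsize.)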
+
+define i32 @pr26023() minsize {
+entry:
+ %x = alloca [120 x i8]
+ %0 = getelementptr inbounds [120 x i8], [120 x i8]* %x, i64 0, i64 0
+ call void asm sideeffect "", "imr,~{memory},~{dirflag},~{fpsr},~{flags}"(i8* %0)
+ %arrayidx = getelementptr inbounds [120 x i8], [120 x i8]* %x, i64 0, i64 119
+ store volatile i8 -2, i8* %arrayidx
+ call void asm sideeffect "", "r,~{dirflag},~{fpsr},~{flags}"(i32 5)
+ %1 = load volatile i8, i8* %arrayidx
+ %conv = sext i8 %1 to i32
+ ret i32 %conv
+
+; The function writes to the redzone, so push/pop cannot be used.
+; CHECK64-LABEL: pr26023:
+; CHECK64: movl $5, %ecx
+; CHECK64: retq
+
+; 32-bit X86 doesn't have a redzone.
+; CHECK32-LABEL: pr26023:
+; CHECK32: pushl $5
+; CHECK32: popl %ecx
+; CHECK32: retl
+}
+
+
+define i64 @one64_minsize() minsize {
+entry:
+ ret i64 1
+; On 64-bit we don't do xor-inc yet, so push-pop it is.
+; CHECK64-LABEL: one64_minsize:
+; CHECK64: pushq $1
+; CHECK64: .cfi_adjust_cfa_offset 8
+; CHECK64: popq %rax
+; CHECK64: .cfi_adjust_cfa_offset -8
+; CHECK64-NEXT: retq
+
+; On Win64 we can't adjust the stack unless there's a frame pointer.
+; CHECKWIN64-LABEL: one64_minsize:
+; CHECKWIN64: movl $1, %eax
+; CHECKWIN64-NEXT: retq
+}
+
+define i32 @minus_one32() optsize {
+entry:
+ ret i32 -1
+
+; CHECK32-LABEL: minus_one32:
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: decl %eax
+; CHECK32-NEXT: retl
+}
+
+define i32 @minus_one32_minsize() minsize {
+entry:
+ ret i32 -1
+
+; xor-dec is preferred over push-pop.
+; CHECK32-LABEL: minus_one32_minsize:
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: decl %eax
+; CHECK32-NEXT: retl
+}
+
+define i16 @one16() optsize {
+entry:
+ ret i16 1
+
+; CHECK32-LABEL: one16:
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: incl %eax
+; CHECK32-NEXT: # kill
+; CHECK32-NEXT: retl
+}
+
+define i16 @minus_one16() optsize {
+entry:
+ ret i16 -1
+
+; CHECK32-LABEL: minus_one16:
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: decl %eax
+; CHECK32-NEXT: # kill
+; CHECK32-NEXT: retl
+}
+
+define i32 @minus_five32() minsize {
+entry:
+ ret i32 -5
+
+; CHECK32-LABEL: minus_five32:
+; CHECK32: pushl $-5
+; CHECK32: popl %eax
+; CHECK32: retl
+}
+
+define i64 @minus_five64() minsize {
+entry:
+ ret i64 -5
+
+; CHECK64-LABEL: minus_five64:
+; CHECK64: pushq $-5
+; CHECK64: .cfi_adjust_cfa_offset 8
+; CHECK64: popq %rax
+; CHECK64: .cfi_adjust_cfa_offset -8
+; CHECK64: retq
+}
+
+define i32 @rematerialize_minus_one() optsize {
+entry:
+ ; Materialize -1 (thiscall forces it into %ecx).
+ tail call x86_thiscallcc void @f(i32 -1)
+
+ ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
+ ; spilling it to the stack.
+ tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+
+ ; -1 should be re-materialized here instead of getting spilled above.
+ ret i32 -1
+
+; CHECK32-LABEL: rematerialize_minus_one
+; CHECK32: xorl %ecx, %ecx
+; CHECK32-NEXT: decl %ecx
+; CHECK32: calll
+; CHECK32: xorl %eax, %eax
+; CHECK32-NEXT: decl %eax
+; CHECK32-NOT: %eax
+; CHECK32: retl
+}
+
+define i32 @rematerialize_minus_one_eflags(i32 %x) optsize {
+entry:
+ ; Materialize -1 (thiscall forces it into %ecx).
+ tail call x86_thiscallcc void @f(i32 -1)
+
+ ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
+ ; spilling it to the stack.
+ tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+
+ ; Define eflags.
+ %a = icmp ne i32 %x, 123
+ %b = zext i1 %a to i32
+ ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags.
+ ; It must therefore not use the xor-dec lowering.
+ %c = select i1 %a, i32 %b, i32 -1
+ ret i32 %c
+
+; CHECK32-LABEL: rematerialize_minus_one_eflags
+; CHECK32: xorl %ecx, %ecx
+; CHECK32-NEXT: decl %ecx
+; CHECK32: calll
+; CHECK32: cmpl
+; CHECK32: setne
+; CHECK32-NOT: xorl
+; CHECK32: movl $-1
+; CHECK32: cmov
+; CHECK32: retl
+}
+
+declare x86_thiscallcc void @f(i32)
diff --git a/test/CodeGen/X86/mbp-false-cfg-break.ll b/test/CodeGen/X86/mbp-false-cfg-break.ll
new file mode 100644
index 000000000000..bc8b0de3eef0
--- /dev/null
+++ b/test/CodeGen/X86/mbp-false-cfg-break.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+
+define void @test(i1 %cnd) !prof !{!"function_entry_count", i64 1024} {
+; CHECK-LABEL: @test
+; Using the assembly comments to indicate block order.
+; CHECK: # %loop
+; CHECK: # %backedge
+; CHECK: # %exit
+; CHECK: # %rare
+; CHECK: # %rare.1
+
+ br i1 undef, label %rare.1, label %preheader, !prof !{!"branch_weights", i32 0, i32 1000}
+rare.1:
+ call void @foo()
+ br label %preheader
+
+preheader:
+ br label %loop
+
+loop:
+ %iv = phi i32 [0, %preheader], [%iv.next, %backedge]
+ call void @foo()
+ br i1 %cnd, label %backedge, label %rare, !prof !{!"branch_weights", i32 1000000, i32 1}
+rare:
+ call void @foo()
+ br label %backedge
+backedge:
+ call void @foo()
+ %iv.next = add i32 %iv, 1
+ %cmp = icmp eq i32 %iv.next, 200
+ br i1 %cmp, label %loop, label %exit, !prof !{!"branch_weights", i32 1000, i32 1}
+
+exit:
+ ret void
+
+}
+
+
+declare void @foo()
diff --git a/test/CodeGen/X86/mcinst-lowering.ll b/test/CodeGen/X86/mcinst-lowering.ll
index 51b2895f1c78..7b16d7616fe5 100644
--- a/test/CodeGen/X86/mcinst-lowering.ll
+++ b/test/CodeGen/X86/mcinst-lowering.ll
@@ -3,26 +3,17 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0"
+declare i32 @foo();
+
define i32 @f0(i32* nocapture %x) nounwind readonly ssp {
entry:
- %tmp1 = load i32, i32* %x ; <i32> [#uses=2]
- %tobool = icmp eq i32 %tmp1, 0 ; <i1> [#uses=1]
- br i1 %tobool, label %if.end, label %return
-
-if.end: ; preds = %entry
-
-; Check that we lower to the short form of cmpl, which has a fixed %eax
-; register.
-;
+ %tmp1 = call i32 @foo()
; CHECK: cmpl $16777216, %eax
; CHECK: # encoding: [0x3d,0x00,0x00,0x00,0x01]
%cmp = icmp eq i32 %tmp1, 16777216 ; <i1> [#uses=1]
%conv = zext i1 %cmp to i32 ; <i32> [#uses=1]
ret i32 %conv
-
-return: ; preds = %entry
- ret i32 0
}
define i32 @f1() nounwind {
diff --git a/test/CodeGen/X86/mcu-abi.ll b/test/CodeGen/X86/mcu-abi.ll
index 966fd4521f2d..1cc277c863f0 100644
--- a/test/CodeGen/X86/mcu-abi.ll
+++ b/test/CodeGen/X86/mcu-abi.ll
@@ -82,6 +82,8 @@ entry:
ret i32 %i1
}
+%struct.S = type { i8 }
+
; CHECK-LABEL: test_lib_args:
; CHECK: movl %edx, %eax
; CHECK: calll __fixsfsi
@@ -91,14 +93,10 @@ define i32 @test_lib_args(float %a, float %b) #0 {
}
; CHECK-LABEL: test_fp128:
-; CHECK: movl (%eax), %e[[CX:..]]
-; CHECK-NEXT: movl 4(%eax), %e[[DX:..]]
-; CHECK-NEXT: movl 8(%eax), %e[[SI:..]]
-; CHECK-NEXT: movl 12(%eax), %e[[AX:..]]
-; CHECK-NEXT: movl %e[[AX]], 12(%esp)
-; CHECK-NEXT: movl %e[[SI]], 8(%esp)
-; CHECK-NEXT: movl %e[[DX]], 4(%esp)
-; CHECK-NEXT: movl %e[[CX]], (%esp)
+; CHECK: pushl 12(%eax)
+; CHECK-NEXT: pushl 8(%eax)
+; CHECK-NEXT: pushl 4(%eax)
+; CHECK-NEXT: pushl (%eax)
; CHECK-NEXT: calll __fixtfsi
define i32 @test_fp128(fp128* %ptr) #0 {
%v = load fp128, fp128* %ptr
@@ -108,5 +106,50 @@ define i32 @test_fp128(fp128* %ptr) #0 {
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1
+; CHECK-LABEL: test_alignment_d:
+; CHECK-NOT: andl {{.+}}, %esp
+define void @test_alignment_d() #0 {
+entry:
+ %d = alloca double
+ store double 2.000000e+00, double* %d
+ call void @food(double* inreg %d)
+ ret void
+}
+
+; CHECK-LABEL: test_alignment_i:
+; CHECK-NOT: andl {{.+}}, %esp
+define void @test_alignment_i() #0 {
+entry:
+ %i = alloca i64
+ store i64 2, i64* %i
+ call void @fooi(i64* inreg %i)
+ ret void
+}
+
+
+; CHECK-LABEL: test_alignment_s:
+; CHECK-NOT: andl {{.+}}, %esp
+define void @test_alignment_s() #0 {
+ %s = alloca %struct.S, align 4
+ call void @foos(%struct.S* inreg %s)
+ ret void
+}
+
+
+; CHECK-LABEL: test_alignment_fp:
+; CHECK-NOT: andl {{.+}}, %esp
+define void @test_alignment_fp() #0 {
+entry:
+ %f = alloca fp128
+ store fp128 0xL00000000000000004000000000000000, fp128* %f
+ call void @foofp(fp128* inreg %f)
+ ret void
+}
+
+declare void @food(double* inreg)
+declare void @fooi(i64* inreg)
+declare void @foos(%struct.S* inreg)
+declare void @foofp(fp128* inreg)
+
attributes #0 = { nounwind "use-soft-float"="true"}
attributes #1 = { nounwind argmemonly }
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index e5f1f526b467..6a51d60f636c 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -23,8 +23,7 @@ return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp2:
; CHECK: movzwl
-; CHECK-NEXT: movzwl
-; CHECK-NEXT: cmpl
+; CHECK-NEXT: cmpw
; NOBUILTIN-LABEL: memcmp2:
; NOBUILTIN: callq
}
@@ -46,6 +45,21 @@ return: ; preds = %entry
; CHECK-NEXT: cmpl $28527,
}
+define void @memcmp2nb(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
+entry:
+ %0 = tail call i32 (...) @memcmp(i8* %X, i8* %Y, i32 2) nounwind nobuiltin ; <i32> [#uses=1]
+ %1 = icmp eq i32 %0, 0 ; <i1> [#uses=1]
+ br i1 %1, label %return, label %bb
+
+bb: ; preds = %entry
+ store i32 4, i32* %P, align 4
+ ret void
+
+return: ; preds = %entry
+ ret void
+; CHECK-LABEL: memcmp2nb:
+; CHECK: callq
+}
define void @memcmp4(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
entry:
diff --git a/test/CodeGen/X86/memcpy-from-string.ll b/test/CodeGen/X86/memcpy-from-string.ll
new file mode 100644
index 000000000000..d62d9e20254a
--- /dev/null
+++ b/test/CodeGen/X86/memcpy-from-string.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -asm-verbose=false | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%0 = type { %1, i64, %2 }
+%1 = type { i8* }
+%2 = type { i64, [8 x i8] }
+
+@0 = internal constant [10 x i8] c"asdf jkl;\00", align 1
+
+; Memcpy lowering should emit stores of immediates containing string data from
+; the correct offsets.
+
+; CHECK-LABEL: foo:
+; CHECK: movb $0, 6(%rdi)
+; CHECK: movw $15212, 4(%rdi)
+; CHECK: movl $1802117222, (%rdi)
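+; As a cross-check of the constants: the copy starts at offset 3 of
+; "asdf jkl;\00", so the 7 bytes stored are "f jk", "l;", and the trailing NUL;
+; little-endian, "f jk" = 0x6B6A2066 = 1802117222 and "l;" = 0x3B6C = 15212.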
+define void @foo(i8* %tmp2) {
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp2, i8* getelementptr inbounds ([10 x i8], [10 x i8]* @0, i64 0, i64 3), i64 7, i32 1, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
diff --git a/test/CodeGen/X86/memset-2.ll b/test/CodeGen/X86/memset-2.ll
index a87ef2e15a5a..a02ef29ca6b3 100644
--- a/test/CodeGen/X86/memset-2.ll
+++ b/test/CodeGen/X86/memset-2.ll
@@ -1,19 +1,30 @@
-; RUN: llc -mtriple=i386-apple-darwin -mcpu=yonah < %s | FileCheck %s
-
-declare void @llvm.memset.i32(i8*, i8, i32, i32) nounwind
+; NOTE: Assertions have been autogenerated by update_test_checks.py
+; RUN: llc -mtriple=i386-apple-darwin9 -mcpu=yonah < %s | FileCheck %s
define fastcc void @t1() nounwind {
-entry:
; CHECK-LABEL: t1:
-; CHECK: calll L_memset$stub
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: subl $16, %esp
+; CHECK-NEXT: pushl $188
+; CHECK-NEXT: pushl $0
+; CHECK-NEXT: pushl $0
+; CHECK-NEXT: calll _memset
+; CHECK-NEXT: addl $16, %esp
+;
+entry:
call void @llvm.memset.p0i8.i32(i8* null, i8 0, i32 188, i32 1, i1 false)
unreachable
}
define fastcc void @t2(i8 signext %c) nounwind {
-entry:
; CHECK-LABEL: t2:
-; CHECK: calll L_memset$stub
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: subl $12, %esp
+; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl $76, {{[0-9]+}}(%esp)
+; CHECK-NEXT: calll _memset
+;
+entry:
call void @llvm.memset.p0i8.i32(i8* undef, i8 %c, i32 76, i32 1, i1 false)
unreachable
}
@@ -21,19 +32,34 @@ entry:
declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
define void @t3(i8* nocapture %s, i8 %a) nounwind {
+; CHECK-LABEL: t3:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: imull $16843009, %ecx, %ecx ## imm = 0x1010101
+; CHECK-NEXT: movl %ecx, 4(%eax)
+; CHECK-NEXT: movl %ecx, (%eax)
+; CHECK-NEXT: retl
+;
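+; (16843009 is 0x01010101; multiplying the zero-extended byte by it splats the
+; byte into all four lanes of a 32-bit value, which is then stored dword by
+; dword.)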
entry:
tail call void @llvm.memset.p0i8.i32(i8* %s, i8 %a, i32 8, i32 1, i1 false)
ret void
-; CHECK-LABEL: t3:
-; CHECK: imull $16843009
}
define void @t4(i8* nocapture %s, i8 %a) nounwind {
+; CHECK-LABEL: t4:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: imull $16843009, %ecx, %ecx ## imm = 0x1010101
+; CHECK-NEXT: movl %ecx, 8(%eax)
+; CHECK-NEXT: movl %ecx, 4(%eax)
+; CHECK-NEXT: movl %ecx, (%eax)
+; CHECK-NEXT: movw %cx, 12(%eax)
+; CHECK-NEXT: movb %cl, 14(%eax)
+; CHECK-NEXT: retl
+;
entry:
tail call void @llvm.memset.p0i8.i32(i8* %s, i8 %a, i32 15, i32 1, i1 false)
ret void
-; CHECK-LABEL: t4:
-; CHECK: imull $16843009
-; CHECK-NOT: imul
-; CHECK: ret
}
diff --git a/test/CodeGen/X86/memset-nonzero.ll b/test/CodeGen/X86/memset-nonzero.ll
new file mode 100644
index 000000000000..29fee0710405
--- /dev/null
+++ b/test/CodeGen/X86/memset-nonzero.ll
@@ -0,0 +1,470 @@
+; NOTE: Assertions have been autogenerated by update_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2 | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2,-slow-unaligned-mem-16 | FileCheck %s --check-prefix=SSE2FAST
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+; https://llvm.org/bugs/show_bug.cgi?id=27100
+
+define void @memset_16_nonzero_bytes(i8* %x) {
+; SSE-LABEL: memset_16_nonzero_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
+; SSE-NEXT: movq %rax, 8(%rdi)
+; SSE-NEXT: movq %rax, (%rdi)
+; SSE-NEXT: retq
+;
+; SSE2FAST-LABEL: memset_16_nonzero_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; SSE2FAST-NEXT: movups %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX-LABEL: memset_16_nonzero_bytes:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX-NEXT: vmovups %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+ %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 16, i64 -1)
+ ret void
+}
+
+define void @memset_32_nonzero_bytes(i8* %x) {
+; SSE-LABEL: memset_32_nonzero_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
+; SSE-NEXT: movq %rax, 24(%rdi)
+; SSE-NEXT: movq %rax, 16(%rdi)
+; SSE-NEXT: movq %rax, 8(%rdi)
+; SSE-NEXT: movq %rax, (%rdi)
+; SSE-NEXT: retq
+;
+; SSE2FAST-LABEL: memset_32_nonzero_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX-LABEL: memset_32_nonzero_bytes:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX-NEXT: vmovups %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+ %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 32, i64 -1)
+ ret void
+}
+
+define void @memset_64_nonzero_bytes(i8* %x) {
+; SSE-LABEL: memset_64_nonzero_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
+; SSE-NEXT: movq %rax, 56(%rdi)
+; SSE-NEXT: movq %rax, 48(%rdi)
+; SSE-NEXT: movq %rax, 40(%rdi)
+; SSE-NEXT: movq %rax, 32(%rdi)
+; SSE-NEXT: movq %rax, 24(%rdi)
+; SSE-NEXT: movq %rax, 16(%rdi)
+; SSE-NEXT: movq %rax, 8(%rdi)
+; SSE-NEXT: movq %rax, (%rdi)
+; SSE-NEXT: retq
+;
+; SSE2FAST-LABEL: memset_64_nonzero_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; SSE2FAST-NEXT: movups %xmm0, 48(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 32(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX-LABEL: memset_64_nonzero_bytes:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX-NEXT: vmovups %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+ %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 64, i64 -1)
+ ret void
+}
+
+define void @memset_128_nonzero_bytes(i8* %x) {
+; SSE-LABEL: memset_128_nonzero_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
+; SSE-NEXT: movq %rax, 120(%rdi)
+; SSE-NEXT: movq %rax, 112(%rdi)
+; SSE-NEXT: movq %rax, 104(%rdi)
+; SSE-NEXT: movq %rax, 96(%rdi)
+; SSE-NEXT: movq %rax, 88(%rdi)
+; SSE-NEXT: movq %rax, 80(%rdi)
+; SSE-NEXT: movq %rax, 72(%rdi)
+; SSE-NEXT: movq %rax, 64(%rdi)
+; SSE-NEXT: movq %rax, 56(%rdi)
+; SSE-NEXT: movq %rax, 48(%rdi)
+; SSE-NEXT: movq %rax, 40(%rdi)
+; SSE-NEXT: movq %rax, 32(%rdi)
+; SSE-NEXT: movq %rax, 24(%rdi)
+; SSE-NEXT: movq %rax, 16(%rdi)
+; SSE-NEXT: movq %rax, 8(%rdi)
+; SSE-NEXT: movq %rax, (%rdi)
+; SSE-NEXT: retq
+;
+; SSE2FAST-LABEL: memset_128_nonzero_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; SSE2FAST-NEXT: movups %xmm0, 112(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 96(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 80(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 64(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 48(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 32(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX-LABEL: memset_128_nonzero_bytes:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX-NEXT: vmovups %ymm0, 96(%rdi)
+; AVX-NEXT: vmovups %ymm0, 64(%rdi)
+; AVX-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX-NEXT: vmovups %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+ %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 128, i64 -1)
+ ret void
+}
+
+define void @memset_256_nonzero_bytes(i8* %x) {
+; SSE-LABEL: memset_256_nonzero_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: pushq %rax
+; SSE-NEXT: .Ltmp0:
+; SSE-NEXT: .cfi_def_cfa_offset 16
+; SSE-NEXT: movl $42, %esi
+; SSE-NEXT: movl $256, %edx # imm = 0x100
+; SSE-NEXT: callq memset
+; SSE-NEXT: popq %rax
+; SSE-NEXT: retq
+;
+; SSE2FAST-LABEL: memset_256_nonzero_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; SSE2FAST-NEXT: movups %xmm0, 240(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 224(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 208(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 192(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 176(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 160(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 144(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 128(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 112(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 96(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 80(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 64(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 48(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 32(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
+; SSE2FAST-NEXT: movups %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX-LABEL: memset_256_nonzero_bytes:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX-NEXT: vmovups %ymm0, 224(%rdi)
+; AVX-NEXT: vmovups %ymm0, 192(%rdi)
+; AVX-NEXT: vmovups %ymm0, 160(%rdi)
+; AVX-NEXT: vmovups %ymm0, 128(%rdi)
+; AVX-NEXT: vmovups %ymm0, 96(%rdi)
+; AVX-NEXT: vmovups %ymm0, 64(%rdi)
+; AVX-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX-NEXT: vmovups %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+ %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 256, i64 -1)
+ ret void
+}
+
+declare i8* @__memset_chk(i8*, i32, i64, i64)
+
+; Repeat with a non-constant value for the stores.
+
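+; To splat an unknown byte %c across a 64-bit GPR, the byte is zero-extended
+; and multiplied by 0x0101010101010101 (72340172838076673); for example,
+; 0x2A * 0x0101010101010101 = 0x2A2A2A2A2A2A2A2A. The vector runs splat the
+; byte with punpcklbw/pshuflw/pshufd (SSE2), vpshufb against zero (AVX1), or
+; vpbroadcastb (AVX2), as the CHECK lines below show.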
+define void @memset_16_nonconst_bytes(i8* %x, i8 %c) {
+; SSE-LABEL: memset_16_nonconst_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: movzbl %sil, %eax
+; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
+; SSE-NEXT: imulq %rax, %rcx
+; SSE-NEXT: movq %rcx, 8(%rdi)
+; SSE-NEXT: movq %rcx, (%rdi)
+; SSE-NEXT: retq
+;
+; SSE2FAST-LABEL: memset_16_nonconst_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movd %esi, %xmm0
+; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX1-LABEL: memset_16_nonconst_bytes:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: memset_16_nonconst_bytes:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX2-NEXT: retq
+;
+ tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 16, i32 1, i1 false)
+ ret void
+}
+
+define void @memset_32_nonconst_bytes(i8* %x, i8 %c) {
+; SSE-LABEL: memset_32_nonconst_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: movzbl %sil, %eax
+; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
+; SSE-NEXT: imulq %rax, %rcx
+; SSE-NEXT: movq %rcx, 24(%rdi)
+; SSE-NEXT: movq %rcx, 16(%rdi)
+; SSE-NEXT: movq %rcx, 8(%rdi)
+; SSE-NEXT: movq %rcx, (%rdi)
+; SSE-NEXT: retq
+;
+; SSE2FAST-LABEL: memset_32_nonconst_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movd %esi, %xmm0
+; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX1-LABEL: memset_32_nonconst_bytes:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovups %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: memset_32_nonconst_bytes:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+ tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 32, i32 1, i1 false)
+ ret void
+}
+
+define void @memset_64_nonconst_bytes(i8* %x, i8 %c) {
+; SSE-LABEL: memset_64_nonconst_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: movzbl %sil, %eax
+; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
+; SSE-NEXT: imulq %rax, %rcx
+; SSE-NEXT: movq %rcx, 56(%rdi)
+; SSE-NEXT: movq %rcx, 48(%rdi)
+; SSE-NEXT: movq %rcx, 40(%rdi)
+; SSE-NEXT: movq %rcx, 32(%rdi)
+; SSE-NEXT: movq %rcx, 24(%rdi)
+; SSE-NEXT: movq %rcx, 16(%rdi)
+; SSE-NEXT: movq %rcx, 8(%rdi)
+; SSE-NEXT: movq %rcx, (%rdi)
+; SSE-NEXT: retq
+;
+; SSE2FAST-LABEL: memset_64_nonconst_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movd %esi, %xmm0
+; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX1-LABEL: memset_64_nonconst_bytes:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX1-NEXT: vmovups %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: memset_64_nonconst_bytes:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+ tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 64, i32 1, i1 false)
+ ret void
+}
+
+define void @memset_128_nonconst_bytes(i8* %x, i8 %c) {
+; SSE-LABEL: memset_128_nonconst_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: movzbl %sil, %eax
+; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
+; SSE-NEXT: imulq %rax, %rcx
+; SSE-NEXT: movq %rcx, 120(%rdi)
+; SSE-NEXT: movq %rcx, 112(%rdi)
+; SSE-NEXT: movq %rcx, 104(%rdi)
+; SSE-NEXT: movq %rcx, 96(%rdi)
+; SSE-NEXT: movq %rcx, 88(%rdi)
+; SSE-NEXT: movq %rcx, 80(%rdi)
+; SSE-NEXT: movq %rcx, 72(%rdi)
+; SSE-NEXT: movq %rcx, 64(%rdi)
+; SSE-NEXT: movq %rcx, 56(%rdi)
+; SSE-NEXT: movq %rcx, 48(%rdi)
+; SSE-NEXT: movq %rcx, 40(%rdi)
+; SSE-NEXT: movq %rcx, 32(%rdi)
+; SSE-NEXT: movq %rcx, 24(%rdi)
+; SSE-NEXT: movq %rcx, 16(%rdi)
+; SSE-NEXT: movq %rcx, 8(%rdi)
+; SSE-NEXT: movq %rcx, (%rdi)
+; SSE-NEXT: retq
+;
+; SSE2FAST-LABEL: memset_128_nonconst_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movd %esi, %xmm0
+; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: movdqu %xmm0, 112(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 96(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 80(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 64(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX1-LABEL: memset_128_nonconst_bytes:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX1-NEXT: vmovups %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: memset_128_nonconst_bytes:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+ tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 128, i32 1, i1 false)
+ ret void
+}
+
+define void @memset_256_nonconst_bytes(i8* %x, i8 %c) {
+; SSE-LABEL: memset_256_nonconst_bytes:
+; SSE: # BB#0:
+; SSE-NEXT: movl $256, %edx # imm = 0x100
+; SSE-NEXT: jmp memset # TAILCALL
+;
+; SSE2FAST-LABEL: memset_256_nonconst_bytes:
+; SSE2FAST: # BB#0:
+; SSE2FAST-NEXT: movd %esi, %xmm0
+; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2FAST-NEXT: movdqu %xmm0, 240(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 224(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 208(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 192(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 176(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 160(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 144(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 128(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 112(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 96(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 80(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 64(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi)
+; SSE2FAST-NEXT: movdqu %xmm0, (%rdi)
+; SSE2FAST-NEXT: retq
+;
+; AVX1-LABEL: memset_256_nonconst_bytes:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovups %ymm0, 224(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 192(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 160(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 128(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
+; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX1-NEXT: vmovups %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: memset_256_nonconst_bytes:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, 224(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, 192(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, 160(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, 128(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+ tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 256, i32 1, i1 false)
+ ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1
+
diff --git a/test/CodeGen/X86/memset64-on-x86-32.ll b/test/CodeGen/X86/memset64-on-x86-32.ll
index 8cfa032797f7..861cb88b0f57 100644
--- a/test/CodeGen/X86/memset64-on-x86-32.ll
+++ b/test/CodeGen/X86/memset64-on-x86-32.ll
@@ -1,12 +1,60 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 5
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movl | count 20
-; RUN: llc < %s -mtriple=i386-pc-mingw32 -mcpu=core2 | grep movl | count 20
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | grep movq | count 10
+; NOTE: Assertions have been autogenerated by update_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=sse4.2 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=SLOW_32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=SLOW_64
define void @bork() nounwind {
-entry:
+; FAST-LABEL: bork:
+; FAST: # BB#0:
+; FAST-NEXT: xorps %xmm0, %xmm0
+; FAST-NEXT: movups %xmm0, 64
+; FAST-NEXT: movups %xmm0, 48
+; FAST-NEXT: movups %xmm0, 32
+; FAST-NEXT: movups %xmm0, 16
+; FAST-NEXT: movups %xmm0, 0
+; FAST-NEXT: retl
+;
+; SLOW_32-LABEL: bork:
+; SLOW_32: # BB#0:
+; SLOW_32-NEXT: movl $0, 4
+; SLOW_32-NEXT: movl $0, 0
+; SLOW_32-NEXT: movl $0, 12
+; SLOW_32-NEXT: movl $0, 8
+; SLOW_32-NEXT: movl $0, 20
+; SLOW_32-NEXT: movl $0, 16
+; SLOW_32-NEXT: movl $0, 28
+; SLOW_32-NEXT: movl $0, 24
+; SLOW_32-NEXT: movl $0, 36
+; SLOW_32-NEXT: movl $0, 32
+; SLOW_32-NEXT: movl $0, 44
+; SLOW_32-NEXT: movl $0, 40
+; SLOW_32-NEXT: movl $0, 52
+; SLOW_32-NEXT: movl $0, 48
+; SLOW_32-NEXT: movl $0, 60
+; SLOW_32-NEXT: movl $0, 56
+; SLOW_32-NEXT: movl $0, 68
+; SLOW_32-NEXT: movl $0, 64
+; SLOW_32-NEXT: movl $0, 76
+; SLOW_32-NEXT: movl $0, 72
+; SLOW_32-NEXT: retl
+;
+; SLOW_64-LABEL: bork:
+; SLOW_64: # BB#0:
+; SLOW_64-NEXT: movq $0, 72
+; SLOW_64-NEXT: movq $0, 64
+; SLOW_64-NEXT: movq $0, 56
+; SLOW_64-NEXT: movq $0, 48
+; SLOW_64-NEXT: movq $0, 40
+; SLOW_64-NEXT: movq $0, 32
+; SLOW_64-NEXT: movq $0, 24
+; SLOW_64-NEXT: movq $0, 16
+; SLOW_64-NEXT: movq $0, 8
+; SLOW_64-NEXT: movq $0, 0
+; SLOW_64-NEXT: retq
+;
call void @llvm.memset.p0i8.i64(i8* null, i8 0, i64 80, i32 4, i1 false)
ret void
}
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
diff --git a/test/CodeGen/X86/merge-consecutive-loads-128.ll b/test/CodeGen/X86/merge-consecutive-loads-128.ll
new file mode 100644
index 000000000000..59b7efdf9bf8
--- /dev/null
+++ b/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -0,0 +1,783 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512F
+;
+; Just one 32-bit run to make sure we do reasonable things.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE
+
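+; Each test builds a vector with insertelement from scalar loads at consecutive
+; offsets; the expected codegen is a single wide load from the lowest address.
+; For example, elements 2 and 3 of a double* start at byte offset 2 * 8 = 16,
+; so the pair becomes one 'movups 16(%rdi)'.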
+define <2 x double> @merge_2f64_f64_23(double* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_2f64_f64_23:
+; SSE: # BB#0:
+; SSE-NEXT: movups 16(%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_2f64_f64_23:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 16(%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_2f64_f64_23:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movups 16(%eax), %xmm0
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
+ %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
+ %val0 = load double, double* %ptr0
+ %val1 = load double, double* %ptr1
+ %res0 = insertelement <2 x double> undef, double %val0, i32 0
+ %res1 = insertelement <2 x double> %res0, double %val1, i32 1
+ ret <2 x double> %res1
+}
+
+define <2 x i64> @merge_2i64_i64_12(i64* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_2i64_i64_12:
+; SSE: # BB#0:
+; SSE-NEXT: movups 8(%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_2i64_i64_12:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 8(%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_2i64_i64_12:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movups 8(%eax), %xmm0
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
+ %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
+ %val0 = load i64, i64* %ptr0
+ %val1 = load i64, i64* %ptr1
+ %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
+ %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
+ ret <2 x i64> %res1
+}
+
+define <4 x float> @merge_4f32_f32_2345(float* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4f32_f32_2345:
+; SSE: # BB#0:
+; SSE-NEXT: movups 8(%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4f32_f32_2345:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 8(%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4f32_f32_2345:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movups 8(%eax), %xmm0
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 2
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 3
+ %ptr2 = getelementptr inbounds float, float* %ptr, i64 4
+ %ptr3 = getelementptr inbounds float, float* %ptr, i64 5
+ %val0 = load float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %val2 = load float, float* %ptr2
+ %val3 = load float, float* %ptr3
+ %res0 = insertelement <4 x float> undef, float %val0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %val1, i32 1
+ %res2 = insertelement <4 x float> %res1, float %val2, i32 2
+ %res3 = insertelement <4 x float> %res2, float %val3, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x float> @merge_4f32_f32_3zuu(float* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4f32_f32_3zuu:
+; SSE: # BB#0:
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4f32_f32_3zuu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4f32_f32_3zuu:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
+ %val0 = load float, float* %ptr0
+ %res0 = insertelement <4 x float> undef, float %val0, i32 0
+ %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
+ ret <4 x float> %res1
+}
+
+define <4 x float> @merge_4f32_f32_34uu(float* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4f32_f32_34uu:
+; SSE: # BB#0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4f32_f32_34uu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4f32_f32_34uu:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 4
+ %val0 = load float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %res0 = insertelement <4 x float> undef, float %val0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %val1, i32 1
+ ret <4 x float> %res1
+}
+
+define <4 x float> @merge_4f32_f32_34z6(float* %ptr) nounwind uwtable noinline ssp {
+; SSE2-LABEL: merge_4f32_f32_34z6:
+; SSE2: # BB#0:
+; SSE2-NEXT: movups 12(%rdi), %xmm0
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: merge_4f32_f32_34z6:
+; SSE41: # BB#0:
+; SSE41-NEXT: movups 12(%rdi), %xmm1
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: merge_4f32_f32_34z6:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2],mem[3]
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4f32_f32_34z6:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movups 12(%eax), %xmm1
+; X32-SSE-NEXT: xorps %xmm0, %xmm0
+; X32-SSE-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 4
+ %ptr3 = getelementptr inbounds float, float* %ptr, i64 6
+ %val0 = load float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %val3 = load float, float* %ptr3
+ %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %val1, i32 1
+ %res3 = insertelement <4 x float> %res1, float %val3, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x float> @merge_4f32_f32_45zz(float* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4f32_f32_45zz:
+; SSE: # BB#0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4f32_f32_45zz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4f32_f32_45zz:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 4
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 5
+ %val0 = load float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %val1, i32 1
+ ret <4 x float> %res1
+}
+
+define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline ssp {
+; SSE2-LABEL: merge_4f32_f32_012u:
+; SSE2: # BB#0:
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: merge_4f32_f32_012u:
+; SSE41: # BB#0:
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: merge_4f32_f32_012u:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4f32_f32_012u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 1
+ %ptr2 = getelementptr inbounds float, float* %ptr, i64 2
+ %val0 = load float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %val2 = load float, float* %ptr2
+ %res0 = insertelement <4 x float> undef, float %val0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %val1, i32 1
+ %res2 = insertelement <4 x float> %res1, float %val2, i32 2
+ %res3 = insertelement <4 x float> %res2, float undef, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline ssp {
+; SSE2-LABEL: merge_4f32_f32_019u:
+; SSE2: # BB#0:
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: merge_4f32_f32_019u:
+; SSE41: # BB#0:
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: merge_4f32_f32_019u:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4f32_f32_019u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 1
+ %ptr2 = getelementptr inbounds float, float* %ptr, i64 9
+ %val0 = load float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %val2 = load float, float* %ptr2
+ %res0 = insertelement <4 x float> undef, float %val0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %val1, i32 1
+ %res2 = insertelement <4 x float> %res1, float %val2, i32 2
+ %res3 = insertelement <4 x float> %res2, float undef, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x i32> @merge_4i32_i32_23u5(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_23u5:
+; SSE: # BB#0:
+; SSE-NEXT: movups 8(%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4i32_i32_23u5:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 8(%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4i32_i32_23u5:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movups 8(%eax), %xmm0
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
+ %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
+ %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
+ %val0 = load i32, i32* %ptr0
+ %val1 = load i32, i32* %ptr1
+ %val3 = load i32, i32* %ptr3
+ %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
+ %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
+ ret <4 x i32> %res3
+}
+
+define <4 x i32> @merge_4i32_i32_3zuu(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_3zuu:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4i32_i32_3zuu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4i32_i32_3zuu:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 3
+ %val0 = load i32, i32* %ptr0
+ %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 0, i32 1
+ ret <4 x i32> %res1
+}
+
+define <4 x i32> @merge_4i32_i32_34uu(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_34uu:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4i32_i32_34uu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4i32_i32_34uu:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 3
+ %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 4
+ %val0 = load i32, i32* %ptr0
+ %val1 = load i32, i32* %ptr1
+ %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
+ ret <4 x i32> %res1
+}
+
+define <4 x i32> @merge_4i32_i32_45zz(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_45zz:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4i32_i32_45zz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4i32_i32_45zz:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
+ %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
+ %val0 = load i32, i32* %ptr0
+ %val1 = load i32, i32* %ptr1
+ %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
+ ret <4 x i32> %res1
+}
+
+define <8 x i16> @merge_8i16_i16_23u567u9(i16* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_8i16_i16_23u567u9:
+; SSE: # BB#0:
+; SSE-NEXT: movups 4(%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_8i16_i16_23u567u9:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 4(%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_8i16_i16_23u567u9:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movups 4(%eax), %xmm0
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
+ %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
+ %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 5
+ %ptr4 = getelementptr inbounds i16, i16* %ptr, i64 6
+ %ptr5 = getelementptr inbounds i16, i16* %ptr, i64 7
+ %ptr7 = getelementptr inbounds i16, i16* %ptr, i64 9
+ %val0 = load i16, i16* %ptr0
+ %val1 = load i16, i16* %ptr1
+ %val3 = load i16, i16* %ptr3
+ %val4 = load i16, i16* %ptr4
+ %val5 = load i16, i16* %ptr5
+ %val7 = load i16, i16* %ptr7
+ %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
+ %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
+ %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
+ %res4 = insertelement <8 x i16> %res3, i16 %val4, i32 4
+ %res5 = insertelement <8 x i16> %res4, i16 %val5, i32 5
+ %res7 = insertelement <8 x i16> %res5, i16 %val7, i32 7
+ ret <8 x i16> %res7
+}
+
+define <8 x i16> @merge_8i16_i16_34uuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_8i16_i16_34uuuuuu:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_8i16_i16_34uuuuuu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_8i16_i16_34uuuuuu:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 3
+ %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 4
+ %val0 = load i16, i16* %ptr0
+ %val1 = load i16, i16* %ptr1
+ %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
+ %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
+ ret <8 x i16> %res1
+}
+
+define <8 x i16> @merge_8i16_i16_45u7zzzz(i16* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_8i16_i16_45u7zzzz:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_8i16_i16_45u7zzzz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_8i16_i16_45u7zzzz:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4
+ %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5
+ %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 7
+ %val0 = load i16, i16* %ptr0
+ %val1 = load i16, i16* %ptr1
+ %val3 = load i16, i16* %ptr3
+ %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
+ %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
+ %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
+ %res4 = insertelement <8 x i16> %res3, i16 0, i32 4
+ %res5 = insertelement <8 x i16> %res4, i16 0, i32 5
+ %res6 = insertelement <8 x i16> %res5, i16 0, i32 6
+ %res7 = insertelement <8 x i16> %res6, i16 0, i32 7
+ ret <8 x i16> %res7
+}
+
+define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(i8* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_16i8_i8_01u3456789ABCDuF:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movups (%eax), %xmm0
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
+ %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
+ %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
+ %ptr4 = getelementptr inbounds i8, i8* %ptr, i64 4
+ %ptr5 = getelementptr inbounds i8, i8* %ptr, i64 5
+ %ptr6 = getelementptr inbounds i8, i8* %ptr, i64 6
+ %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 7
+ %ptr8 = getelementptr inbounds i8, i8* %ptr, i64 8
+ %ptr9 = getelementptr inbounds i8, i8* %ptr, i64 9
+ %ptrA = getelementptr inbounds i8, i8* %ptr, i64 10
+ %ptrB = getelementptr inbounds i8, i8* %ptr, i64 11
+ %ptrC = getelementptr inbounds i8, i8* %ptr, i64 12
+ %ptrD = getelementptr inbounds i8, i8* %ptr, i64 13
+ %ptrF = getelementptr inbounds i8, i8* %ptr, i64 15
+ %val0 = load i8, i8* %ptr0
+ %val1 = load i8, i8* %ptr1
+ %val3 = load i8, i8* %ptr3
+ %val4 = load i8, i8* %ptr4
+ %val5 = load i8, i8* %ptr5
+ %val6 = load i8, i8* %ptr6
+ %val7 = load i8, i8* %ptr7
+ %val8 = load i8, i8* %ptr8
+ %val9 = load i8, i8* %ptr9
+ %valA = load i8, i8* %ptrA
+ %valB = load i8, i8* %ptrB
+ %valC = load i8, i8* %ptrC
+ %valD = load i8, i8* %ptrD
+ %valF = load i8, i8* %ptrF
+ %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
+ %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
+ %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
+ %res4 = insertelement <16 x i8> %res3, i8 %val4, i32 4
+ %res5 = insertelement <16 x i8> %res4, i8 %val5, i32 5
+ %res6 = insertelement <16 x i8> %res5, i8 %val6, i32 6
+ %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
+ %res8 = insertelement <16 x i8> %res7, i8 %val8, i32 8
+ %res9 = insertelement <16 x i8> %res8, i8 %val9, i32 9
+ %resA = insertelement <16 x i8> %res9, i8 %valA, i32 10
+ %resB = insertelement <16 x i8> %resA, i8 %valB, i32 11
+ %resC = insertelement <16 x i8> %resB, i8 %valC, i32 12
+ %resD = insertelement <16 x i8> %resC, i8 %valD, i32 13
+ %resF = insertelement <16 x i8> %resD, i8 %valF, i32 15
+ ret <16 x i8> %resF
+}
+
+define <16 x i8> @merge_16i8_i8_01u3uuzzuuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
+ %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
+ %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
+ %val0 = load i8, i8* %ptr0
+ %val1 = load i8, i8* %ptr1
+ %val3 = load i8, i8* %ptr3
+ %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
+ %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
+ %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
+ %res6 = insertelement <16 x i8> %res3, i8 0, i32 6
+ %res7 = insertelement <16 x i8> %res6, i8 0, i32 7
+ %resD = insertelement <16 x i8> %res7, i8 0, i32 13
+ %resE = insertelement <16 x i8> %resD, i8 0, i32 14
+ %resF = insertelement <16 x i8> %resE, i8 0, i32 15
+ ret <16 x i8> %resF
+}
+
+define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
+ %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
+ %ptr2 = getelementptr inbounds i8, i8* %ptr, i64 2
+ %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
+ %ptr6 = getelementptr inbounds i8, i8* %ptr, i64 6
+ %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 7
+ %val0 = load i8, i8* %ptr0
+ %val1 = load i8, i8* %ptr1
+ %val2 = load i8, i8* %ptr2
+ %val3 = load i8, i8* %ptr3
+ %val6 = load i8, i8* %ptr6
+ %val7 = load i8, i8* %ptr7
+ %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
+ %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
+ %res2 = insertelement <16 x i8> %res1, i8 %val2, i32 2
+ %res3 = insertelement <16 x i8> %res2, i8 %val3, i32 3
+ %res6 = insertelement <16 x i8> %res3, i8 %val6, i32 6
+ %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
+ %resD = insertelement <16 x i8> %res7, i8 0, i32 13
+ %resE = insertelement <16 x i8> %resD, i8 0, i32 14
+ %resF = insertelement <16 x i8> %resE, i8 0, i32 15
+ ret <16 x i8> %resF
+}
+
+define void @merge_4i32_i32_combine(<4 x i32>* %dst, i32* %src) {
+; SSE-LABEL: merge_4i32_i32_combine:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movaps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: merge_4i32_i32_combine:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovaps %xmm0, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: merge_4i32_i32_combine:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovaps %xmm0, (%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: merge_4i32_i32_combine:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX512F-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4i32_i32_combine:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: movaps %xmm0, (%eax)
+; X32-SSE-NEXT: retl
+ %1 = getelementptr i32, i32* %src, i32 0
+ %2 = load i32, i32* %1
+ %3 = insertelement <4 x i32> undef, i32 %2, i32 0
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
+ %5 = lshr <4 x i32> %4, <i32 0, i32 undef, i32 undef, i32 undef>
+ %6 = and <4 x i32> %5, <i32 -1, i32 0, i32 0, i32 0>
+ store <4 x i32> %6, <4 x i32>* %dst
+ ret void
+}
+
+;
+; Consecutive loads that include any volatile load must not be combined.
+;
+
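+; Each volatile access must remain a separate instruction, so the checks below
+; expect two scalar movq loads combined with punpcklqdq rather than a single
+; 16-byte movups.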
+define <2 x i64> @merge_2i64_i64_12_volatile(i64* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_2i64_i64_12_volatile:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_2i64_i64_12_volatile:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_2i64_i64_12_volatile:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: pinsrd $1, 12(%eax), %xmm0
+; X32-SSE-NEXT: pinsrd $2, 16(%eax), %xmm0
+; X32-SSE-NEXT: pinsrd $3, 20(%eax), %xmm0
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
+ %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
+ %val0 = load volatile i64, i64* %ptr0
+ %val1 = load volatile i64, i64* %ptr1
+ %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
+ %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
+ ret <2 x i64> %res1
+}
+
+define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable noinline ssp {
+; SSE2-LABEL: merge_4f32_f32_2345_volatile:
+; SSE2: # BB#0:
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: merge_4f32_f32_2345_volatile:
+; SSE41: # BB#0:
+; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: merge_4f32_f32_2345_volatile:
+; AVX: # BB#0:
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4f32_f32_2345_volatile:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; X32-SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X32-SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 2
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 3
+ %ptr2 = getelementptr inbounds float, float* %ptr, i64 4
+ %ptr3 = getelementptr inbounds float, float* %ptr, i64 5
+ %val0 = load volatile float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %val2 = load float, float* %ptr2
+ %val3 = load float, float* %ptr3
+ %res0 = insertelement <4 x float> undef, float %val0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %val1, i32 1
+ %res2 = insertelement <4 x float> %res1, float %val2, i32 2
+ %res3 = insertelement <4 x float> %res2, float %val3, i32 3
+ ret <4 x float> %res3
+}
+
+;
+; Non-consecutive test: loads through two unrelated pointers cannot be merged.
+;
+
+define <4 x float> @merge_4f32_f32_X0YY(float* %ptr0, float* %ptr1) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4f32_f32_X0YY:
+; SSE: # BB#0:
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4f32_f32_X0YY:
+; AVX: # BB#0:
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: merge_4f32_f32_X0YY:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; X32-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT: retl
+ %val0 = load float, float* %ptr0, align 4
+ %val1 = load float, float* %ptr1, align 4
+ %res0 = insertelement <4 x float> undef, float %val0, i32 0
+ %res1 = insertelement <4 x float> %res0, float 0.000000e+00, i32 1
+ %res2 = insertelement <4 x float> %res1, float %val1, i32 2
+ %res3 = insertelement <4 x float> %res2, float %val1, i32 3
+ ret <4 x float> %res3
+}
diff --git a/test/CodeGen/X86/merge-consecutive-loads-256.ll b/test/CodeGen/X86/merge-consecutive-loads-256.ll
new file mode 100644
index 000000000000..8c2e93729004
--- /dev/null
+++ b/test/CodeGen/X86/merge-consecutive-loads-256.ll
@@ -0,0 +1,756 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512F
+;
+; Just one 32-bit run to make sure we do reasonable things.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X32-AVX
+
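+; Same idea as the 128-bit file, but merging into ymm registers: fully
+; consecutive loads become a single 32-byte vmovups, and a 16-byte load merged
+; with zero uses vinsertf128 of a zeroed xmm into the upper half.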
+define <4 x double> @merge_4f64_2f64_23(<2 x double>* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4f64_2f64_23:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 32(%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4f64_2f64_23:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovups 32(%eax), %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
+ %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 3
+ %val0 = load <2 x double>, <2 x double>* %ptr0
+ %val1 = load <2 x double>, <2 x double>* %ptr1
+ %res = shufflevector <2 x double> %val0, <2 x double> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %res
+}
+
+define <4 x double> @merge_4f64_2f64_2z(<2 x double>* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4f64_2f64_2z:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps 32(%rdi), %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4f64_2f64_2z:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovaps 32(%eax), %xmm0
+; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
+ %val0 = load <2 x double>, <2 x double>* %ptr0
+ %res = shufflevector <2 x double> %val0, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %res
+}
+
+define <4 x double> @merge_4f64_f64_2345(double* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4f64_f64_2345:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 16(%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4f64_f64_2345:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovups 16(%eax), %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
+ %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
+ %ptr2 = getelementptr inbounds double, double* %ptr, i64 4
+ %ptr3 = getelementptr inbounds double, double* %ptr, i64 5
+ %val0 = load double, double* %ptr0
+ %val1 = load double, double* %ptr1
+ %val2 = load double, double* %ptr2
+ %val3 = load double, double* %ptr3
+ %res0 = insertelement <4 x double> undef, double %val0, i32 0
+ %res1 = insertelement <4 x double> %res0, double %val1, i32 1
+ %res2 = insertelement <4 x double> %res1, double %val2, i32 2
+ %res3 = insertelement <4 x double> %res2, double %val3, i32 3
+ ret <4 x double> %res3
+}
+
+define <4 x double> @merge_4f64_f64_3zuu(double* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4f64_f64_3zuu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4f64_f64_3zuu:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 3
+ %val0 = load double, double* %ptr0
+ %res0 = insertelement <4 x double> undef, double %val0, i32 0
+ %res1 = insertelement <4 x double> %res0, double 0.0, i32 1
+ ret <4 x double> %res1
+}
+
+define <4 x double> @merge_4f64_f64_34uu(double* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4f64_f64_34uu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 24(%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4f64_f64_34uu:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovups 24(%eax), %xmm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 3
+ %ptr1 = getelementptr inbounds double, double* %ptr, i64 4
+ %val0 = load double, double* %ptr0
+ %val1 = load double, double* %ptr1
+ %res0 = insertelement <4 x double> undef, double %val0, i32 0
+ %res1 = insertelement <4 x double> %res0, double %val1, i32 1
+ ret <4 x double> %res1
+}
+
+define <4 x double> @merge_4f64_f64_45zz(double* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4f64_f64_45zz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 32(%rdi), %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4f64_f64_45zz:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovups 32(%eax), %xmm0
+; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 4
+ %ptr1 = getelementptr inbounds double, double* %ptr, i64 5
+ %val0 = load double, double* %ptr0
+ %val1 = load double, double* %ptr1
+ %res0 = insertelement <4 x double> zeroinitializer, double %val0, i32 0
+ %res1 = insertelement <4 x double> %res0, double %val1, i32 1
+ ret <4 x double> %res1
+}
+
+define <4 x double> @merge_4f64_f64_34z6(double* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4f64_f64_34z6:
+; AVX: # BB#0:
+; AVX-NEXT: vxorpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3]
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4f64_f64_34z6:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vxorpd %ymm0, %ymm0, %ymm0
+; X32-AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3]
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 3
+ %ptr1 = getelementptr inbounds double, double* %ptr, i64 4
+ %ptr3 = getelementptr inbounds double, double* %ptr, i64 6
+ %val0 = load double, double* %ptr0
+ %val1 = load double, double* %ptr1
+ %val3 = load double, double* %ptr3
+ %res0 = insertelement <4 x double> undef, double %val0, i32 0
+ %res1 = insertelement <4 x double> %res0, double %val1, i32 1
+ %res2 = insertelement <4 x double> %res1, double 0.0, i32 2
+ %res3 = insertelement <4 x double> %res2, double %val3, i32 3
+ ret <4 x double> %res3
+}
+
+define <4 x i64> @merge_4i64_2i64_3z(<2 x i64>* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4i64_2i64_3z:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps 48(%rdi), %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4i64_2i64_3z:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovaps 48(%eax), %xmm0
+; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 3
+ %val0 = load <2 x i64>, <2 x i64>* %ptr0
+ %res = shufflevector <2 x i64> %val0, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @merge_4i64_i64_1234(i64* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4i64_i64_1234:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 8(%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4i64_i64_1234:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovups 8(%eax), %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
+ %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
+ %ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3
+ %ptr3 = getelementptr inbounds i64, i64* %ptr, i64 4
+ %val0 = load i64, i64* %ptr0
+ %val1 = load i64, i64* %ptr1
+ %val2 = load i64, i64* %ptr2
+ %val3 = load i64, i64* %ptr3
+ %res0 = insertelement <4 x i64> undef, i64 %val0, i32 0
+ %res1 = insertelement <4 x i64> %res0, i64 %val1, i32 1
+ %res2 = insertelement <4 x i64> %res1, i64 %val2, i32 2
+ %res3 = insertelement <4 x i64> %res2, i64 %val3, i32 3
+ ret <4 x i64> %res3
+}
+
+define <4 x i64> @merge_4i64_i64_1zzu(i64* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4i64_i64_1zzu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4i64_i64_1zzu:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
+ %val0 = load i64, i64* %ptr0
+ %res0 = insertelement <4 x i64> undef, i64 %val0, i32 0
+ %res1 = insertelement <4 x i64> %res0, i64 0, i32 1
+ %res2 = insertelement <4 x i64> %res1, i64 0, i32 2
+ ret <4 x i64> %res2
+}
+
+define <4 x i64> @merge_4i64_i64_23zz(i64* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_4i64_i64_23zz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups 16(%rdi), %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4i64_i64_23zz:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovups 16(%eax), %xmm0
+; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 2
+ %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 3
+ %val0 = load i64, i64* %ptr0
+ %val1 = load i64, i64* %ptr1
+ %res0 = insertelement <4 x i64> zeroinitializer, i64 %val0, i32 0
+ %res1 = insertelement <4 x i64> %res0, i64 %val1, i32 1
+ ret <4 x i64> %res1
+}
+
+define <8 x float> @merge_8f32_2f32_23z5(<2 x float>* %ptr) nounwind uwtable noinline ssp {
+; AVX1-LABEL: merge_8f32_2f32_23z5:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovupd 16(%rdi), %xmm0
+; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: merge_8f32_2f32_23z5:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovupd 16(%rdi), %xmm0
+; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: merge_8f32_2f32_23z5:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups 16(%rdi), %xmm0
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; X32-AVX-LABEL: merge_8f32_2f32_23z5:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vxorpd %ymm0, %ymm0, %ymm0
+; X32-AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3]
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 2
+ %ptr1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3
+ %ptr3 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 5
+ %val0 = load <2 x float>, <2 x float>* %ptr0
+ %val1 = load <2 x float>, <2 x float>* %ptr1
+ %val3 = load <2 x float>, <2 x float>* %ptr3
+ %res01 = shufflevector <2 x float> %val0, <2 x float> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res23 = shufflevector <2 x float> zeroinitializer, <2 x float> %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = shufflevector <4 x float> %res01, <4 x float> %res23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x float> @merge_8f32_4f32_z2(<4 x float>* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_8f32_4f32_z2:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_8f32_4f32_z2:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; X32-AVX-NEXT: vinsertf128 $1, 32(%eax), %ymm0, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
+ %val1 = load <4 x float>, <4 x float>* %ptr1
+ %res = shufflevector <4 x float> zeroinitializer, <4 x float> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x float> @merge_8f32_f32_12zzuuzz(float* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_8f32_f32_12zzuuzz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_8f32_f32_12zzuuzz:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 1
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 2
+ %val0 = load float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %res0 = insertelement <8 x float> undef, float %val0, i32 0
+ %res1 = insertelement <8 x float> %res0, float %val1, i32 1
+ %res2 = insertelement <8 x float> %res1, float 0.0, i32 2
+ %res3 = insertelement <8 x float> %res2, float 0.0, i32 3
+ %res6 = insertelement <8 x float> %res3, float 0.0, i32 6
+ %res7 = insertelement <8 x float> %res6, float 0.0, i32 7
+ ret <8 x float> %res7
+}
+
+define <8 x float> @merge_8f32_f32_1u3u5zu8(float* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_8f32_f32_1u3u5zu8:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_8f32_f32_1u3u5zu8:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 1
+ %ptr2 = getelementptr inbounds float, float* %ptr, i64 3
+ %ptr4 = getelementptr inbounds float, float* %ptr, i64 5
+ %ptr7 = getelementptr inbounds float, float* %ptr, i64 8
+ %val0 = load float, float* %ptr0
+ %val2 = load float, float* %ptr2
+ %val4 = load float, float* %ptr4
+ %val7 = load float, float* %ptr7
+ %res0 = insertelement <8 x float> undef, float %val0, i32 0
+ %res2 = insertelement <8 x float> %res0, float %val2, i32 2
+ %res4 = insertelement <8 x float> %res2, float %val4, i32 4
+ %res5 = insertelement <8 x float> %res4, float 0.0, i32 5
+ %res7 = insertelement <8 x float> %res5, float %val7, i32 7
+ ret <8 x float> %res7
+}
+
+define <8 x i32> @merge_8i32_4i32_z3(<4 x i32>* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_8i32_4i32_z3:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, 48(%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_8i32_4i32_z3:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; X32-AVX-NEXT: vinsertf128 $1, 48(%eax), %ymm0, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
+ %val1 = load <4 x i32>, <4 x i32>* %ptr1
+ %res = shufflevector <4 x i32> zeroinitializer, <4 x i32> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @merge_8i32_i32_56zz9uzz(i32* %ptr) nounwind uwtable noinline ssp {
+; AVX1-LABEL: merge_8i32_i32_56zz9uzz:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: merge_8i32_i32_56zz9uzz:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: merge_8i32_i32_56zz9uzz:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; X32-AVX-LABEL: merge_8i32_i32_56zz9uzz:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 5
+ %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 6
+ %ptr4 = getelementptr inbounds i32, i32* %ptr, i64 9
+ %val0 = load i32, i32* %ptr0
+ %val1 = load i32, i32* %ptr1
+ %val4 = load i32, i32* %ptr4
+ %res0 = insertelement <8 x i32> undef, i32 %val0, i32 0
+ %res1 = insertelement <8 x i32> %res0, i32 %val1, i32 1
+ %res2 = insertelement <8 x i32> %res1, i32 0, i32 2
+ %res3 = insertelement <8 x i32> %res2, i32 0, i32 3
+ %res4 = insertelement <8 x i32> %res3, i32 %val4, i32 4
+ %res6 = insertelement <8 x i32> %res4, i32 0, i32 6
+ %res7 = insertelement <8 x i32> %res6, i32 0, i32 7
+ ret <8 x i32> %res7
+}
+
+define <8 x i32> @merge_8i32_i32_1u3u5zu8(i32* %ptr) nounwind uwtable noinline ssp {
+; AVX1-LABEL: merge_8i32_i32_1u3u5zu8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: merge_8i32_i32_1u3u5zu8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: merge_8i32_i32_1u3u5zu8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
+; AVX512F-NEXT: retq
+;
+; X32-AVX-LABEL: merge_8i32_i32_1u3u5zu8:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 1
+ %ptr2 = getelementptr inbounds i32, i32* %ptr, i64 3
+ %ptr4 = getelementptr inbounds i32, i32* %ptr, i64 5
+ %ptr7 = getelementptr inbounds i32, i32* %ptr, i64 8
+ %val0 = load i32, i32* %ptr0
+ %val2 = load i32, i32* %ptr2
+ %val4 = load i32, i32* %ptr4
+ %val7 = load i32, i32* %ptr7
+ %res0 = insertelement <8 x i32> undef, i32 %val0, i32 0
+ %res2 = insertelement <8 x i32> %res0, i32 %val2, i32 2
+ %res4 = insertelement <8 x i32> %res2, i32 %val4, i32 4
+ %res5 = insertelement <8 x i32> %res4, i32 0, i32 5
+ %res7 = insertelement <8 x i32> %res5, i32 %val7, i32 7
+ ret <8 x i32> %res7
+}
+
+define <16 x i16> @merge_16i16_i16_89zzzuuuuuuuuuuuz(i16* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 8
+ %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 9
+ %val0 = load i16, i16* %ptr0
+ %val1 = load i16, i16* %ptr1
+ %res0 = insertelement <16 x i16> undef, i16 %val0, i16 0
+ %res1 = insertelement <16 x i16> %res0, i16 %val1, i16 1
+ %res2 = insertelement <16 x i16> %res1, i16 0, i16 2
+ %res3 = insertelement <16 x i16> %res2, i16 0, i16 3
+ %res4 = insertelement <16 x i16> %res3, i16 0, i16 4
+ %resF = insertelement <16 x i16> %res4, i16 0, i16 15
+ ret <16 x i16> %resF
+}
+
+define <16 x i16> @merge_16i16_i16_45u7uuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_16i16_i16_45u7uuuuuuuuuuuu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_16i16_i16_45u7uuuuuuuuuuuu:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4
+ %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5
+ %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 7
+ %val0 = load i16, i16* %ptr0
+ %val1 = load i16, i16* %ptr1
+ %val3 = load i16, i16* %ptr3
+ %res0 = insertelement <16 x i16> undef, i16 %val0, i16 0
+ %res1 = insertelement <16 x i16> %res0, i16 %val1, i16 1
+ %res3 = insertelement <16 x i16> %res1, i16 %val3, i16 3
+ ret <16 x i16> %res3
+}
+
+define <16 x i16> @merge_16i16_i16_0uu3uuuuuuuuCuEF(i16* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_16i16_i16_0uu3uuuuuuuuCuEF:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_16i16_i16_0uu3uuuuuuuuCuEF:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovups (%eax), %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 0
+ %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 3
+ %ptrC = getelementptr inbounds i16, i16* %ptr, i64 12
+ %ptrE = getelementptr inbounds i16, i16* %ptr, i64 14
+ %ptrF = getelementptr inbounds i16, i16* %ptr, i64 15
+ %val0 = load i16, i16* %ptr0
+ %val3 = load i16, i16* %ptr3
+ %valC = load i16, i16* %ptrC
+ %valE = load i16, i16* %ptrE
+ %valF = load i16, i16* %ptrF
+ %res0 = insertelement <16 x i16> undef, i16 %val0, i16 0
+ %res3 = insertelement <16 x i16> %res0, i16 %val3, i16 3
+ %resC = insertelement <16 x i16> %res3, i16 %valC, i16 12
+ %resE = insertelement <16 x i16> %resC, i16 %valE, i16 14
+ %resF = insertelement <16 x i16> %resE, i16 %valF, i16 15
+ ret <16 x i16> %resF
+}
+
+define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF(i16* %ptr) nounwind uwtable noinline ssp {
+; AVX1-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [65535,0,0,65535,0,0,0,0,0,0,0,0,65535,0,65535,65535]
+; AVX1-NEXT: vandps (%rdi), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovups (%rdi), %ymm0
+; AVX2-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; X32-AVX-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,0,0,65535,0,0,0,0,0,0,0,0,65535,0,65535,65535]
+; X32-AVX-NEXT: vandps (%eax), %ymm0, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 0
+ %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 3
+ %ptrC = getelementptr inbounds i16, i16* %ptr, i64 12
+ %ptrE = getelementptr inbounds i16, i16* %ptr, i64 14
+ %ptrF = getelementptr inbounds i16, i16* %ptr, i64 15
+ %val0 = load i16, i16* %ptr0
+ %val3 = load i16, i16* %ptr3
+ %valC = load i16, i16* %ptrC
+ %valE = load i16, i16* %ptrE
+ %valF = load i16, i16* %ptrF
+ %res0 = insertelement <16 x i16> undef, i16 %val0, i16 0
+ %res3 = insertelement <16 x i16> %res0, i16 %val3, i16 3
+ %res4 = insertelement <16 x i16> %res3, i16 0, i16 4
+ %res5 = insertelement <16 x i16> %res4, i16 0, i16 5
+ %resC = insertelement <16 x i16> %res5, i16 %valC, i16 12
+ %resD = insertelement <16 x i16> %resC, i16 0, i16 13
+ %resE = insertelement <16 x i16> %resD, i16 %valE, i16 14
+ %resF = insertelement <16 x i16> %resE, i16 %valF, i16 15
+ ret <16 x i16> %resF
+}
+
+define <32 x i8> @merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(i8* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 4
+ %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 5
+ %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 7
+ %val0 = load i8, i8* %ptr0
+ %val1 = load i8, i8* %ptr1
+ %val3 = load i8, i8* %ptr3
+ %res0 = insertelement <32 x i8> undef, i8 %val0, i8 0
+ %res1 = insertelement <32 x i8> %res0, i8 %val1, i8 1
+ %res3 = insertelement <32 x i8> %res1, i8 %val3, i8 3
+ ret <32 x i8> %res3
+}
+
+define <32 x i8> @merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu(i8* %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: retq
+;
+; X32-AVX-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 2
+ %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 3
+ %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 5
+ %val0 = load i8, i8* %ptr0
+ %val1 = load i8, i8* %ptr1
+ %val3 = load i8, i8* %ptr3
+ %res0 = insertelement <32 x i8> undef, i8 %val0, i8 0
+ %res1 = insertelement <32 x i8> %res0, i8 %val1, i8 1
+ %res3 = insertelement <32 x i8> %res1, i8 %val3, i8 3
+ %resE = insertelement <32 x i8> %res3, i8 0, i8 14
+ %resF = insertelement <32 x i8> %resE, i8 0, i8 15
+ %resG = insertelement <32 x i8> %resF, i8 0, i8 16
+ %resH = insertelement <32 x i8> %resG, i8 0, i8 17
+ ret <32 x i8> %resH
+}
+
+;
+; consecutive loads including any/all volatiles may not be combined
+;
+
+define <4 x double> @merge_4f64_f64_34uz_volatile(double* %ptr) nounwind uwtable noinline ssp {
+; AVX1-LABEL: merge_4f64_f64_34uz_volatile:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: merge_4f64_f64_34uz_volatile:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: merge_4f64_f64_34uz_volatile:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; X32-AVX-LABEL: merge_4f64_f64_34uz_volatile:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X32-AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 3
+ %ptr1 = getelementptr inbounds double, double* %ptr, i64 4
+ %val0 = load volatile double, double* %ptr0
+ %val1 = load volatile double, double* %ptr1
+ %res0 = insertelement <4 x double> undef, double %val0, i32 0
+ %res1 = insertelement <4 x double> %res0, double %val1, i32 1
+ %res3 = insertelement <4 x double> %res1, double 0.0, i32 3
+ ret <4 x double> %res3
+}
+
+define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile(i16* %ptr) nounwind uwtable noinline ssp {
+; AVX1-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm1
+; AVX1-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1
+; AVX1-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm1
+; AVX2-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1
+; AVX2-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm1
+; AVX512F-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: retq
+;
+; X32-AVX-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; X32-AVX-NEXT: vpinsrw $0, (%eax), %xmm0, %xmm1
+; X32-AVX-NEXT: vpinsrw $3, 6(%eax), %xmm1, %xmm1
+; X32-AVX-NEXT: vpinsrw $4, 24(%eax), %xmm0, %xmm0
+; X32-AVX-NEXT: vpinsrw $6, 28(%eax), %xmm0, %xmm0
+; X32-AVX-NEXT: vpinsrw $7, 30(%eax), %xmm0, %xmm0
+; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 0
+ %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 3
+ %ptrC = getelementptr inbounds i16, i16* %ptr, i64 12
+ %ptrE = getelementptr inbounds i16, i16* %ptr, i64 14
+ %ptrF = getelementptr inbounds i16, i16* %ptr, i64 15
+ %val0 = load volatile i16, i16* %ptr0
+ %val3 = load i16, i16* %ptr3
+ %valC = load i16, i16* %ptrC
+ %valE = load i16, i16* %ptrE
+ %valF = load volatile i16, i16* %ptrF
+ %res0 = insertelement <16 x i16> undef, i16 %val0, i16 0
+ %res3 = insertelement <16 x i16> %res0, i16 %val3, i16 3
+ %res4 = insertelement <16 x i16> %res3, i16 0, i16 4
+ %res5 = insertelement <16 x i16> %res4, i16 0, i16 5
+ %resC = insertelement <16 x i16> %res5, i16 %valC, i16 12
+ %resD = insertelement <16 x i16> %resC, i16 0, i16 13
+ %resE = insertelement <16 x i16> %resD, i16 %valE, i16 14
+ %resF = insertelement <16 x i16> %resE, i16 %valF, i16 15
+ ret <16 x i16> %resF
+}
diff --git a/test/CodeGen/X86/merge-consecutive-loads-512.ll b/test/CodeGen/X86/merge-consecutive-loads-512.ll
new file mode 100644
index 000000000000..bb9a342ae9ae
--- /dev/null
+++ b/test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -0,0 +1,718 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
+;
+; Just one 32-bit run to make sure we do reasonable things.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32-AVX512F
+
+define <8 x double> @merge_8f64_2f64_12u4(<2 x double>* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8f64_2f64_12u4:
+; ALL: # BB#0:
+; ALL-NEXT: vmovupd 16(%rdi), %ymm0
+; ALL-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm1
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8f64_2f64_12u4:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovupd 16(%eax), %ymm0
+; X32-AVX512F-NEXT: vinsertf128 $1, 64(%eax), %ymm0, %ymm1
+; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 1
+ %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
+ %ptr3 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 4
+ %val0 = load <2 x double>, <2 x double>* %ptr0
+ %val1 = load <2 x double>, <2 x double>* %ptr1
+ %val3 = load <2 x double>, <2 x double>* %ptr3
+ %res01 = shufflevector <2 x double> %val0, <2 x double> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res23 = shufflevector <2 x double> undef, <2 x double> %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = shufflevector <4 x double> %res01, <4 x double> %res23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <8 x double> @merge_8f64_2f64_23z5(<2 x double>* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8f64_2f64_23z5:
+; ALL: # BB#0:
+; ALL-NEXT: vmovupd 32(%rdi), %ymm0
+; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vinsertf128 $1, 80(%rdi), %ymm1, %ymm1
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8f64_2f64_23z5:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovupd 32(%eax), %ymm0
+; X32-AVX512F-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; X32-AVX512F-NEXT: vinsertf128 $1, 80(%eax), %ymm1, %ymm1
+; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
+ %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 3
+ %ptr3 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 5
+ %val0 = load <2 x double>, <2 x double>* %ptr0
+ %val1 = load <2 x double>, <2 x double>* %ptr1
+ %val3 = load <2 x double>, <2 x double>* %ptr3
+ %res01 = shufflevector <2 x double> %val0, <2 x double> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res23 = shufflevector <2 x double> zeroinitializer, <2 x double> %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = shufflevector <4 x double> %res01, <4 x double> %res23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <8 x double> @merge_8f64_4f64_z2(<4 x double>* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8f64_4f64_z2:
+; ALL: # BB#0:
+; ALL-NEXT: vxorpd %ymm0, %ymm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, 64(%rdi), %zmm0, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8f64_4f64_z2:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vxorpd %ymm0, %ymm0, %ymm0
+; X32-AVX512F-NEXT: vinsertf64x4 $1, 64(%eax), %zmm0, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr1 = getelementptr inbounds <4 x double>, <4 x double>* %ptr, i64 2
+ %val1 = load <4 x double>, <4 x double>* %ptr1
+ %res = shufflevector <4 x double> zeroinitializer, <4 x double> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <8 x double> @merge_8f64_f64_23uuuuu9(double* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8f64_f64_23uuuuu9:
+; ALL: # BB#0:
+; ALL-NEXT: vmovupd 16(%rdi), %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8f64_f64_23uuuuu9:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovupd 16(%eax), %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
+ %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
+ %ptr7 = getelementptr inbounds double, double* %ptr, i64 9
+ %val0 = load double, double* %ptr0
+ %val1 = load double, double* %ptr1
+ %val7 = load double, double* %ptr7
+ %res0 = insertelement <8 x double> undef, double %val0, i32 0
+ %res1 = insertelement <8 x double> %res0, double %val1, i32 1
+ %res7 = insertelement <8 x double> %res1, double %val7, i32 7
+ ret <8 x double> %res7
+}
+
+define <8 x double> @merge_8f64_f64_12zzuuzz(double* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8f64_f64_12zzuuzz:
+; ALL: # BB#0:
+; ALL-NEXT: vmovupd 8(%rdi), %xmm0
+; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8f64_f64_12zzuuzz:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovupd 8(%eax), %xmm0
+; X32-AVX512F-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; X32-AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX512F-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 1
+ %ptr1 = getelementptr inbounds double, double* %ptr, i64 2
+ %val0 = load double, double* %ptr0
+ %val1 = load double, double* %ptr1
+ %res0 = insertelement <8 x double> undef, double %val0, i32 0
+ %res1 = insertelement <8 x double> %res0, double %val1, i32 1
+ %res2 = insertelement <8 x double> %res1, double 0.0, i32 2
+ %res3 = insertelement <8 x double> %res2, double 0.0, i32 3
+ %res6 = insertelement <8 x double> %res3, double 0.0, i32 6
+ %res7 = insertelement <8 x double> %res6, double 0.0, i32 7
+ ret <8 x double> %res7
+}
+
+define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8f64_f64_1u3u5zu8:
+; ALL: # BB#0:
+; ALL-NEXT: vmovupd 8(%rdi), %zmm0
+; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; ALL-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,2,u,4,13,u,7>
+; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovupd 8(%eax), %zmm0
+; X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; X32-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>
+; X32-AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 1
+ %ptr2 = getelementptr inbounds double, double* %ptr, i64 3
+ %ptr4 = getelementptr inbounds double, double* %ptr, i64 5
+ %ptr7 = getelementptr inbounds double, double* %ptr, i64 8
+ %val0 = load double, double* %ptr0
+ %val2 = load double, double* %ptr2
+ %val4 = load double, double* %ptr4
+ %val7 = load double, double* %ptr7
+ %res0 = insertelement <8 x double> undef, double %val0, i32 0
+ %res2 = insertelement <8 x double> %res0, double %val2, i32 2
+ %res4 = insertelement <8 x double> %res2, double %val4, i32 4
+ %res5 = insertelement <8 x double> %res4, double 0.0, i32 5
+ %res7 = insertelement <8 x double> %res5, double %val7, i32 7
+ ret <8 x double> %res7
+}
+
+define <8 x i64> @merge_8i64_4i64_z3(<4 x i64>* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8i64_4i64_z3:
+; ALL: # BB#0:
+; ALL-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, 96(%rdi), %zmm0, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8i64_4i64_z3:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; X32-AVX512F-NEXT: vinserti64x4 $1, 96(%eax), %zmm0, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr1 = getelementptr inbounds <4 x i64>, <4 x i64>* %ptr, i64 3
+ %val1 = load <4 x i64>, <4 x i64>* %ptr1
+ %res = shufflevector <4 x i64> zeroinitializer, <4 x i64> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8i64_i64_56zz9uzz:
+; ALL: # BB#0:
+; ALL-NEXT: vmovdqu 40(%rdi), %xmm0
+; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8i64_i64_56zz9uzz:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovdqu 40(%eax), %xmm0
+; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X32-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; X32-AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 5
+ %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 6
+ %ptr4 = getelementptr inbounds i64, i64* %ptr, i64 9
+ %val0 = load i64, i64* %ptr0
+ %val1 = load i64, i64* %ptr1
+ %val4 = load i64, i64* %ptr4
+ %res0 = insertelement <8 x i64> undef, i64 %val0, i32 0
+ %res1 = insertelement <8 x i64> %res0, i64 %val1, i32 1
+ %res2 = insertelement <8 x i64> %res1, i64 0, i32 2
+ %res3 = insertelement <8 x i64> %res2, i64 0, i32 3
+ %res4 = insertelement <8 x i64> %res3, i64 %val4, i32 4
+ %res6 = insertelement <8 x i64> %res4, i64 0, i32 6
+ %res7 = insertelement <8 x i64> %res6, i64 0, i32 7
+ ret <8 x i64> %res7
+}
+
+define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8i64_i64_1u3u5zu8:
+; ALL: # BB#0:
+; ALL-NEXT: vmovdqu64 8(%rdi), %zmm0
+; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; ALL-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,2,u,4,13,u,7>
+; ALL-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0
+; X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; X32-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>
+; X32-AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
+ %ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3
+ %ptr4 = getelementptr inbounds i64, i64* %ptr, i64 5
+ %ptr7 = getelementptr inbounds i64, i64* %ptr, i64 8
+ %val0 = load i64, i64* %ptr0
+ %val2 = load i64, i64* %ptr2
+ %val4 = load i64, i64* %ptr4
+ %val7 = load i64, i64* %ptr7
+ %res0 = insertelement <8 x i64> undef, i64 %val0, i32 0
+ %res2 = insertelement <8 x i64> %res0, i64 %val2, i32 2
+ %res4 = insertelement <8 x i64> %res2, i64 %val4, i32 4
+ %res5 = insertelement <8 x i64> %res4, i64 0, i32 5
+ %res7 = insertelement <8 x i64> %res5, i64 %val7, i32 7
+ ret <8 x i64> %res7
+}
+
+define <16 x float> @merge_16f32_f32_89zzzuuuuuuuuuuuz(float* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
+; ALL: # BB#0:
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 8
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 9
+ %val0 = load float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %res0 = insertelement <16 x float> undef, float %val0, i32 0
+ %res1 = insertelement <16 x float> %res0, float %val1, i32 1
+ %res2 = insertelement <16 x float> %res1, float 0.0, i32 2
+ %res3 = insertelement <16 x float> %res2, float 0.0, i32 3
+ %res4 = insertelement <16 x float> %res3, float 0.0, i32 4
+ %resF = insertelement <16 x float> %res4, float 0.0, i32 15
+ ret <16 x float> %resF
+}
+
+define <16 x float> @merge_16f32_f32_45u7uuuuuuuuuuuu(float* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu:
+; ALL: # BB#0:
+; ALL-NEXT: vmovups 16(%rdi), %xmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovups 16(%eax), %xmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 4
+ %ptr1 = getelementptr inbounds float, float* %ptr, i64 5
+ %ptr3 = getelementptr inbounds float, float* %ptr, i64 7
+ %val0 = load float, float* %ptr0
+ %val1 = load float, float* %ptr1
+ %val3 = load float, float* %ptr3
+ %res0 = insertelement <16 x float> undef, float %val0, i32 0
+ %res1 = insertelement <16 x float> %res0, float %val1, i32 1
+ %res3 = insertelement <16 x float> %res1, float %val3, i32 3
+ ret <16 x float> %res3
+}
+
+define <16 x float> @merge_16f32_f32_0uu3uuuuuuuuCuEF(float* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF:
+; ALL: # BB#0:
+; ALL-NEXT: vmovups (%rdi), %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovups (%eax), %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
+ %ptr3 = getelementptr inbounds float, float* %ptr, i64 3
+ %ptrC = getelementptr inbounds float, float* %ptr, i64 12
+ %ptrE = getelementptr inbounds float, float* %ptr, i64 14
+ %ptrF = getelementptr inbounds float, float* %ptr, i64 15
+ %val0 = load float, float* %ptr0
+ %val3 = load float, float* %ptr3
+ %valC = load float, float* %ptrC
+ %valE = load float, float* %ptrE
+ %valF = load float, float* %ptrF
+ %res0 = insertelement <16 x float> undef, float %val0, i32 0
+ %res3 = insertelement <16 x float> %res0, float %val3, i32 3
+ %resC = insertelement <16 x float> %res3, float %valC, i32 12
+ %resE = insertelement <16 x float> %resC, float %valE, i32 14
+ %resF = insertelement <16 x float> %resE, float %valF, i32 15
+ ret <16 x float> %resF
+}
+
+define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(float* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
+; ALL: # BB#0:
+; ALL-NEXT: vmovups (%rdi), %zmm0
+; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; ALL-NEXT: vmovdqa32 {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
+; ALL-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovups (%eax), %zmm0
+; X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; X32-AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
+; X32-AVX512F-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
+ %ptr3 = getelementptr inbounds float, float* %ptr, i64 3
+ %ptrC = getelementptr inbounds float, float* %ptr, i64 12
+ %ptrE = getelementptr inbounds float, float* %ptr, i64 14
+ %ptrF = getelementptr inbounds float, float* %ptr, i64 15
+ %val0 = load float, float* %ptr0
+ %val3 = load float, float* %ptr3
+ %valC = load float, float* %ptrC
+ %valE = load float, float* %ptrE
+ %valF = load float, float* %ptrF
+ %res0 = insertelement <16 x float> undef, float %val0, i32 0
+ %res3 = insertelement <16 x float> %res0, float %val3, i32 3
+ %res4 = insertelement <16 x float> %res3, float 0.0, i32 4
+ %res5 = insertelement <16 x float> %res4, float 0.0, i32 5
+ %resC = insertelement <16 x float> %res5, float %valC, i32 12
+ %resD = insertelement <16 x float> %resC, float 0.0, i32 13
+ %resE = insertelement <16 x float> %resD, float %valE, i32 14
+ %resF = insertelement <16 x float> %resE, float %valF, i32 15
+ ret <16 x float> %resF
+}
+
+define <16 x i32> @merge_16i32_i32_12zzzuuuuuuuuuuuz(i32* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
+; ALL: # BB#0:
+; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 1
+ %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 2
+ %val0 = load i32, i32* %ptr0
+ %val1 = load i32, i32* %ptr1
+ %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
+ %res1 = insertelement <16 x i32> %res0, i32 %val1, i32 1
+ %res2 = insertelement <16 x i32> %res1, i32 0, i32 2
+ %res3 = insertelement <16 x i32> %res2, i32 0, i32 3
+ %res4 = insertelement <16 x i32> %res3, i32 0, i32 4
+ %resF = insertelement <16 x i32> %res4, i32 0, i32 15
+ ret <16 x i32> %resF
+}
+
+define <16 x i32> @merge_16i32_i32_23u5uuuuuuuuuuuu(i32* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu:
+; ALL: # BB#0:
+; ALL-NEXT: vmovups 8(%rdi), %xmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovups 8(%eax), %xmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
+ %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
+ %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
+ %val0 = load i32, i32* %ptr0
+ %val1 = load i32, i32* %ptr1
+ %val3 = load i32, i32* %ptr3
+ %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
+ %res1 = insertelement <16 x i32> %res0, i32 %val1, i32 1
+ %res3 = insertelement <16 x i32> %res1, i32 %val3, i32 3
+ ret <16 x i32> %res3
+}
+
+define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF(i32* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF:
+; ALL: # BB#0:
+; ALL-NEXT: vmovdqu32 (%rdi), %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovdqu32 (%eax), %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
+ %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
+ %ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
+ %ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
+ %ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
+ %val0 = load i32, i32* %ptr0
+ %val3 = load i32, i32* %ptr3
+ %valC = load i32, i32* %ptrC
+ %valE = load i32, i32* %ptrE
+ %valF = load i32, i32* %ptrF
+ %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
+ %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
+ %resC = insertelement <16 x i32> %res3, i32 %valC, i32 12
+ %resE = insertelement <16 x i32> %resC, i32 %valE, i32 14
+ %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
+ ret <16 x i32> %resF
+}
+
+define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
+; ALL: # BB#0:
+; ALL-NEXT: vmovdqu32 (%rdi), %zmm0
+; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; ALL-NEXT: vmovdqa32 {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
+; ALL-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovdqu32 (%eax), %zmm0
+; X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; X32-AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
+; X32-AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
+ %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
+ %ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
+ %ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
+ %ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
+ %val0 = load i32, i32* %ptr0
+ %val3 = load i32, i32* %ptr3
+ %valC = load i32, i32* %ptrC
+ %valE = load i32, i32* %ptrE
+ %valF = load i32, i32* %ptrF
+ %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
+ %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
+ %res4 = insertelement <16 x i32> %res3, i32 0, i32 4
+ %res5 = insertelement <16 x i32> %res4, i32 0, i32 5
+ %resC = insertelement <16 x i32> %res5, i32 %valC, i32 12
+ %resD = insertelement <16 x i32> %resC, i32 0, i32 13
+ %resE = insertelement <16 x i32> %resD, i32 %valE, i32 14
+ %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
+ ret <16 x i32> %resF
+}
+
+define <32 x i16> @merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz(i16* %ptr) nounwind uwtable noinline ssp {
+; AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512BW-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 1
+ %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 2
+ %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 4
+ %val0 = load i16, i16* %ptr0
+ %val1 = load i16, i16* %ptr1
+ %val3 = load i16, i16* %ptr3
+ %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
+ %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
+ %res3 = insertelement <32 x i16> %res1, i16 %val3, i16 3
+ %res30 = insertelement <32 x i16> %res3, i16 0, i16 30
+ %res31 = insertelement <32 x i16> %res30, i16 0, i16 31
+ ret <32 x i16> %res31
+}
+
+define <32 x i16> @merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
+; ALL: # BB#0:
+; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4
+ %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5
+ %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 7
+ %val0 = load i16, i16* %ptr0
+ %val1 = load i16, i16* %ptr1
+ %val3 = load i16, i16* %ptr3
+ %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
+ %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
+ %res3 = insertelement <32 x i16> %res1, i16 %val3, i16 3
+ ret <32 x i16> %res3
+}
+
+define <32 x i16> @merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
+; AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512BW-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
+ %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
+ %val0 = load i16, i16* %ptr0
+ %val1 = load i16, i16* %ptr1
+ %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
+ %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
+ %res3 = insertelement <32 x i16> %res1, i16 0, i16 3
+ %resE = insertelement <32 x i16> %res3, i16 0, i16 14
+ %resF = insertelement <32 x i16> %resE, i16 0, i16 15
+ %resG = insertelement <32 x i16> %resF, i16 0, i16 16
+ %resH = insertelement <32 x i16> %resG, i16 0, i16 17
+ ret <32 x i16> %resH
+}
+
+define <64 x i8> @merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp {
+; AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512BW-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
+ %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2
+ %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 4
+ %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 8
+ %val0 = load i8, i8* %ptr0
+ %val1 = load i8, i8* %ptr1
+ %val3 = load i8, i8* %ptr3
+ %val7 = load i8, i8* %ptr7
+ %res0 = insertelement <64 x i8> undef, i8 %val0, i8 0
+ %res1 = insertelement <64 x i8> %res0, i8 %val1, i8 1
+ %res3 = insertelement <64 x i8> %res1, i8 %val3, i8 3
+ %res7 = insertelement <64 x i8> %res3, i8 %val7, i8 7
+ %res14 = insertelement <64 x i8> %res7, i8 0, i8 14
+ %res15 = insertelement <64 x i8> %res14, i8 0, i8 15
+ %res16 = insertelement <64 x i8> %res15, i8 0, i8 16
+ %res17 = insertelement <64 x i8> %res16, i8 0, i8 17
+ %res63 = insertelement <64 x i8> %res17, i8 0, i8 63
+ ret <64 x i8> %res63
+}
+
+define <64 x i8> @merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp {
+; AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512BW-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
+ %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2
+ %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 4
+ %val0 = load i8, i8* %ptr0
+ %val1 = load i8, i8* %ptr1
+ %val3 = load i8, i8* %ptr3
+ %res0 = insertelement <64 x i8> undef, i8 %val0, i8 0
+ %res1 = insertelement <64 x i8> %res0, i8 %val1, i8 1
+ %res3 = insertelement <64 x i8> %res1, i8 %val3, i8 3
+ %res14 = insertelement <64 x i8> %res3, i8 0, i8 14
+ %res15 = insertelement <64 x i8> %res14, i8 0, i8 15
+ %res16 = insertelement <64 x i8> %res15, i8 0, i8 16
+ %res17 = insertelement <64 x i8> %res16, i8 0, i8 17
+ %res63 = insertelement <64 x i8> %res17, i8 0, i8 63
+ ret <64 x i8> %res63
+}
+
+;
+; consecutive loads including any/all volatiles may not be combined
+;
+
+define <8 x double> @merge_8f64_f64_23uuuuu9_volatile(double* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8f64_f64_23uuuuu9_volatile:
+; ALL: # BB#0:
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; ALL-NEXT: vbroadcastsd 72(%rdi), %ymm1
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_8f64_f64_23uuuuu9_volatile:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX512F-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X32-AVX512F-NEXT: vbroadcastsd 72(%eax), %ymm1
+; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
+ %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
+ %ptr7 = getelementptr inbounds double, double* %ptr, i64 9
+ %val0 = load volatile double, double* %ptr0
+ %val1 = load double, double* %ptr1
+ %val7 = load double, double* %ptr7
+ %res0 = insertelement <8 x double> undef, double %val0, i32 0
+ %res1 = insertelement <8 x double> %res0, double %val1, i32 1
+ %res7 = insertelement <8 x double> %res1, double %val7, i32 7
+ ret <8 x double> %res7
+}
+
+define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile(i32* %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile:
+; ALL: # BB#0:
+; ALL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT: vpinsrd $3, 12(%rdi), %xmm0, %xmm0
+; ALL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ALL-NEXT: vpinsrd $2, 56(%rdi), %xmm1, %xmm1
+; ALL-NEXT: vpinsrd $3, 60(%rdi), %xmm1, %xmm1
+; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+;
+; X32-AVX512F-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile:
+; X32-AVX512F: # BB#0:
+; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-AVX512F-NEXT: vpinsrd $3, 12(%eax), %xmm0, %xmm0
+; X32-AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-AVX512F-NEXT: vpinsrd $2, 56(%eax), %xmm1, %xmm1
+; X32-AVX512F-NEXT: vpinsrd $3, 60(%eax), %xmm1, %xmm1
+; X32-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; X32-AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
+ %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
+ %ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
+ %ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
+ %ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
+ %val0 = load volatile i32, i32* %ptr0
+ %val3 = load volatile i32, i32* %ptr3
+ %valC = load volatile i32, i32* %ptrC
+ %valE = load volatile i32, i32* %ptrE
+ %valF = load volatile i32, i32* %ptrF
+ %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
+ %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
+ %resC = insertelement <16 x i32> %res3, i32 %valC, i32 12
+ %resE = insertelement <16 x i32> %resC, i32 %valE, i32 14
+ %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
+ ret <16 x i32> %resF
+}
diff --git a/test/CodeGen/X86/merge-sp-update-lea.ll b/test/CodeGen/X86/merge-sp-update-lea.ll
new file mode 100644
index 000000000000..70209a2aec92
--- /dev/null
+++ b/test/CodeGen/X86/merge-sp-update-lea.ll
@@ -0,0 +1,32 @@
+; RUN: llc %s -o - | FileCheck %s
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.5"
+
+; Check that the merging of SP updates, when LEAs are involved, happens
+; correctly.
+; CHECK-LABEL: useLEA:
+; CHECK: calll _realloc
+; Make sure that the offset we get here is 8 + 16.
+; We used to have 8 + 1 because we were not reading the right immediate from
+; the LEA instruction.
+; CHECK-NEXT: leal 24(%esp), %esp
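+; That is, the merged SP update must restore 8 + 16 = 24 bytes in a single leal.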
+define noalias i8* @useLEA(i8* nocapture %p, i32 %nbytes) #0 {
+entry:
+ %cmp = icmp slt i32 %nbytes, 0
+ br i1 %cmp, label %cond.end.3, label %cond.false
+
+cond.false: ; preds = %entry
+ %tobool = icmp ne i32 %nbytes, 0
+ %cond = select i1 %tobool, i32 %nbytes, i32 1
+ %call = tail call i8* @realloc(i8* %p, i32 %cond)
+ br label %cond.end.3
+
+cond.end.3: ; preds = %entry, %cond.false
+ %cond4 = phi i8* [ %call, %cond.false ], [ null, %entry ]
+ ret i8* %cond4
+}
+
+; Function Attrs: nounwind optsize
+declare noalias i8* @realloc(i8* nocapture, i32)
+
+attributes #0 = { nounwind optsize ssp "disable-tail-calls"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "target-features"="+lea-sp" }
diff --git a/test/CodeGen/X86/merge-store-partially-alias-loads.ll b/test/CodeGen/X86/merge-store-partially-alias-loads.ll
index 8e148aa76d38..735e64a076d0 100644
--- a/test/CodeGen/X86/merge-store-partially-alias-loads.ll
+++ b/test/CodeGen/X86/merge-store-partially-alias-loads.ll
@@ -6,10 +6,10 @@
; they must not be placed on the same chain after merging.
; X86-LABEL: {{^}}merge_store_partial_overlap_load:
-; X86-DAG: movw ([[BASEREG:%[a-z]+]]), [[LO2:%[a-z]+]]
+; X86-DAG: movzwl ([[BASEREG:%[a-z]+]]), %e[[LO2:[a-z]+]]
; X86-DAG: movb 2([[BASEREG]]), [[HI1:%[a-z]+]]
-; X86-NEXT: movw [[LO2]], 1([[BASEREG]])
+; X86-NEXT: movw %[[LO2]], 1([[BASEREG]])
; X86-NEXT: movb [[HI1]], 3([[BASEREG]])
; X86-NEXT: retq
diff --git a/test/CodeGen/X86/mfence.ll b/test/CodeGen/X86/mfence.ll
index 6056adddcb4b..b67a5c355044 100644
--- a/test/CodeGen/X86/mfence.ll
+++ b/test/CodeGen/X86/mfence.ll
@@ -1,8 +1,37 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep sfence
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep lfence
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep mfence
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2 | FileCheck %s --check-prefix=X64
+
+; It doesn't matter if an x86-64 target has specified "no-sse2"; we can still use mfence.
define void @test() {
+; X32-LABEL: test:
+; X32: # BB#0:
+; X32-NEXT: mfence
+; X32-NEXT: retl
+;
+; X64-LABEL: test:
+; X64: # BB#0:
+; X64-NEXT: mfence
+; X64-NEXT: retq
fence seq_cst
ret void
}
+
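+; The seq_cst add of 0 below is an idempotent RMW; as the checks show, it is
+; lowered to a full fence (mfence) followed by a plain load.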
+define i32 @fence(i32* %ptr) {
+; X32-LABEL: fence:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: mfence
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: fence:
+; X64: # BB#0:
+; X64-NEXT: mfence
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: retq
+ %atomic = atomicrmw add i32* %ptr, i32 0 seq_cst
+ ret i32 %atomic
+}
+
diff --git a/test/CodeGen/X86/mingw-alloca.ll b/test/CodeGen/X86/mingw-alloca.ll
index cbad4fbfa2ea..44097e983689 100644
--- a/test/CodeGen/X86/mingw-alloca.ll
+++ b/test/CodeGen/X86/mingw-alloca.ll
@@ -22,12 +22,12 @@ entry:
; COFF: andl $-16, %esp
; COFF: pushl %eax
; COFF: calll __alloca
-; COFF: movl 8028(%esp), %eax
+; COFF: movl 8012(%esp), %eax
; ELF: foo2:
; ELF: andl $-16, %esp
; ELF: pushl %eax
; ELF: calll _alloca
-; ELF: movl 8028(%esp), %eax
+; ELF: movl 8012(%esp), %eax
%A2 = alloca [2000 x i32], align 16 ; <[2000 x i32]*> [#uses=1]
%A2.sub = getelementptr [2000 x i32], [2000 x i32]* %A2, i32 0, i32 0 ; <i32*> [#uses=1]
call void @bar2( i32* %A2.sub, i32 %N )
diff --git a/test/CodeGen/X86/misched-aa-colored.ll b/test/CodeGen/X86/misched-aa-colored.ll
index ef7b98ac9c69..9f8f3a946e66 100644
--- a/test/CodeGen/X86/misched-aa-colored.ll
+++ b/test/CodeGen/X86/misched-aa-colored.ll
@@ -155,6 +155,7 @@ entry:
%ref.tmp.i = alloca %"struct.std::pair.112.119.719.1079.2039.2159.2399.4199", align 8
%Op.i = alloca %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083", align 8
%0 = bitcast %"struct.std::pair.112.119.719.1079.2039.2159.2399.4199"* %ref.tmp.i to i8*
+ call void @llvm.lifetime.start(i64 24, i8* %0) #1
%retval.sroa.0.0.idx.i36 = getelementptr inbounds %"struct.std::pair.112.119.719.1079.2039.2159.2399.4199", %"struct.std::pair.112.119.719.1079.2039.2159.2399.4199"* %ref.tmp.i, i64 0, i32 1, i32 0, i32 0
%retval.sroa.0.0.copyload.i37 = load i32, i32* %retval.sroa.0.0.idx.i36, align 8
call void @llvm.lifetime.end(i64 24, i8* %0) #1
diff --git a/test/CodeGen/X86/misched-code-difference-with-debug.ll b/test/CodeGen/X86/misched-code-difference-with-debug.ll
index 0a1ea830a41d..db218f4bd097 100644
--- a/test/CodeGen/X86/misched-code-difference-with-debug.ll
+++ b/test/CodeGen/X86/misched-code-difference-with-debug.ll
@@ -49,10 +49,10 @@ entry:
%0 = load i8, i8* @argc, align 1
tail call void @llvm.dbg.value(metadata i8 %0, i64 0, metadata !19, metadata !29), !dbg !DILocation(scope: !13)
%conv = sext i8 %0 to i32
- tail call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !29), !dbg !DILocation(scope: !13)
+ tail call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !DIExpression(DW_OP_deref)), !dbg !DILocation(scope: !13)
%call = call i32 (%class.C*, i8, i8, i8, ...) @test_function(%class.C* %c, i8 signext 0, i8 signext %0, i8 signext 0, i32 %conv)
%1 = load i8, i8* @argc, align 1
- call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !29), !dbg !DILocation(scope: !13)
+ call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !DIExpression(DW_OP_deref)), !dbg !DILocation(scope: !13)
%call2 = call i32 (%class.C*, i8, i8, i8, ...) @test_function(%class.C* %c, i8 signext 0, i8 signext %1, i8 signext 0, i32 %conv)
ret void
}
@@ -62,25 +62,24 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!22, !23}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, enums: !2, retainedTypes: !3, subprograms: !12, globals: !20, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, enums: !2, retainedTypes: !3, globals: !20, imports: !2, emissionKind: FullDebug)
!1 = !DIFile(filename: "test.cpp", directory: "")
!2 = !{}
!3 = !{!4}
!4 = !DICompositeType(tag: DW_TAG_class_type, name: "C", line: 2, size: 8, align: 8, file: !1, elements: !5, identifier: "_ZTS1C")
!5 = !{!6}
-!6 = !DISubprogram(name: "test", file: !1, scope: !"_ZTS1C", type: !7, isDefinition: false)
+!6 = !DISubprogram(name: "test", file: !1, scope: !4, type: !7, isDefinition: false)
!7 = !DISubroutineType(types: !8)
!8 = !{!9, !10, !11, !11, !11, null}
!9 = !DIBasicType(encoding: DW_ATE_signed, size: 32, align: 32, name: "int")
-!10 = !DIDerivedType(baseType: !"_ZTS1C", tag: DW_TAG_pointer_type, size: 64, align: 64, flags: DIFlagArtificial)
+!10 = !DIDerivedType(baseType: !4, tag: DW_TAG_pointer_type, size: 64, align: 64, flags: DIFlagArtificial)
!11 = !DIBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
-!12 = !{!13}
-!13 = distinct !DISubprogram(name: "test_with_debug", linkageName: "test_with_debug", line: 6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 6, file: !1, scope: !14, type: !15, variables: !17)
+!13 = distinct !DISubprogram(name: "test_with_debug", linkageName: "test_with_debug", line: 6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 6, file: !1, scope: !14, type: !15, variables: !17)
!14 = !DIFile(filename: "test.cpp", directory: "")
!15 = !DISubroutineType(types: !16)
!16 = !{null}
!17 = !{!18, !19}
-!18 = !DILocalVariable(name: "c", line: 7, scope: !13, file: !14, type: !"_ZTS1C")
+!18 = !DILocalVariable(name: "c", line: 7, scope: !13, file: !14, type: !4)
!19 = !DILocalVariable(name: "lc", line: 8, scope: !13, file: !14, type: !11)
!20 = !{!21}
!21 = !DIGlobalVariable(name: "argc", line: 1, isLocal: false, isDefinition: true, scope: null, file: !14, type: !11, variable: i8* @argc)
diff --git a/test/CodeGen/X86/misched-ilp.ll b/test/CodeGen/X86/misched-ilp.ll
index 4ca296ca92e5..2babae25ea49 100644
--- a/test/CodeGen/X86/misched-ilp.ll
+++ b/test/CodeGen/X86/misched-ilp.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -mcpu=nocona -enable-misched -misched=ilpmax | FileCheck -check-prefix=MAX %s
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -mcpu=nocona -enable-misched -misched=ilpmin | FileCheck -check-prefix=MIN %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -enable-misched -misched=ilpmax | FileCheck -check-prefix=MAX %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -enable-misched -misched=ilpmin | FileCheck -check-prefix=MIN %s
;
; Basic verification of the ScheduleDAGILP metric.
;
diff --git a/test/CodeGen/X86/mmx-bitcast-fold.ll b/test/CodeGen/X86/mmx-bitcast-fold.ll
new file mode 100644
index 000000000000..fc7ce73a441e
--- /dev/null
+++ b/test/CodeGen/X86/mmx-bitcast-fold.ll
@@ -0,0 +1,12 @@
+; RUN: opt -mtriple=x86_64-- -early-cse < %s -S | FileCheck %s
+
+; CHECK: @foo(x86_mmx bitcast (double 0.000000e+00 to x86_mmx))
+
+define void @bar() {
+entry:
+ %0 = bitcast double 0.0 to x86_mmx
+ %1 = call x86_mmx @foo(x86_mmx %0)
+ ret void
+}
+
+declare x86_mmx @foo(x86_mmx)
diff --git a/test/CodeGen/X86/movgs.ll b/test/CodeGen/X86/movgs.ll
index 07d497b9f0a9..8e964bf16898 100644
--- a/test/CodeGen/X86/movgs.ll
+++ b/test/CodeGen/X86/movgs.ll
@@ -59,9 +59,10 @@ entry:
%0 = load i64, i64 addrspace(256)* %p
%tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0
%1 = bitcast <2 x i64> %tmp2 to <8 x i16>
- %2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone
- %3 = bitcast <4 x i32> %2 to <2 x i64>
- ret <2 x i64> %3
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = sext <4 x i16> %2 to <4 x i32>
+ %4 = bitcast <4 x i32> %3 to <2 x i64>
+ ret <2 x i64> %4
}
; The two loads here both look identical to selection DAG, except for their
@@ -90,5 +91,3 @@ entry:
%tmp4 = add i32 %tmp1, %tmp3
ret i32 %tmp4
}
-
-declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
diff --git a/test/CodeGen/X86/movmsk.ll b/test/CodeGen/X86/movmsk.ll
index a7ebebca4b72..1caa22a15947 100644
--- a/test/CodeGen/X86/movmsk.ll
+++ b/test/CodeGen/X86/movmsk.ll
@@ -1,12 +1,17 @@
-; RUN: llc -mcpu=core2 < %s | FileCheck %s
-; ModuleID = '<stdin>'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-macosx10.6.6"
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.6.6 -mattr=+sse4.1 | FileCheck %s
%0 = type { double }
%union.anon = type { float }
define i32 @double_signbit(double %d1) nounwind uwtable readnone ssp {
+; CHECK-LABEL: double_signbit:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movmskpd %xmm0, %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: retq
entry:
%__x.addr.i = alloca double, align 8
%__u.i = alloca %0, align 8
@@ -16,15 +21,20 @@ entry:
%__f.i = getelementptr inbounds %0, %0* %__u.i, i64 0, i32 0
store double %d1, double* %__f.i, align 8
%tmp = bitcast double %d1 to i64
-; CHECK-NOT: shr
-; CHECK: movmskpd
-; CHECK-NEXT: and
%tmp1 = lshr i64 %tmp, 63
%shr.i = trunc i64 %tmp1 to i32
ret i32 %shr.i
}
define i32 @double_add_signbit(double %d1, double %d2) nounwind uwtable readnone ssp {
+; CHECK-LABEL: double_add_signbit:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: addsd %xmm1, %xmm0
+; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movmskpd %xmm0, %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: retq
entry:
%__x.addr.i = alloca double, align 8
%__u.i = alloca %0, align 8
@@ -35,15 +45,19 @@ entry:
%__f.i = getelementptr inbounds %0, %0* %__u.i, i64 0, i32 0
store double %add, double* %__f.i, align 8
%tmp = bitcast double %add to i64
-; CHECK-NOT: shr
-; CHECK: movmskpd
-; CHECK-NEXT: and
%tmp1 = lshr i64 %tmp, 63
%shr.i = trunc i64 %tmp1 to i32
ret i32 %shr.i
}
define i32 @float_signbit(float %f1) nounwind uwtable readnone ssp {
+; CHECK-LABEL: float_signbit:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movmskps %xmm0, %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: retq
entry:
%__x.addr.i = alloca float, align 4
%__u.i = alloca %union.anon, align 4
@@ -53,14 +67,19 @@ entry:
%__f.i = getelementptr inbounds %union.anon, %union.anon* %__u.i, i64 0, i32 0
store float %f1, float* %__f.i, align 4
%2 = bitcast float %f1 to i32
-; CHECK-NOT: shr
-; CHECK: movmskps
-; CHECK-NEXT: and
%shr.i = lshr i32 %2, 31
ret i32 %shr.i
}
define i32 @float_add_signbit(float %f1, float %f2) nounwind uwtable readnone ssp {
+; CHECK-LABEL: float_add_signbit:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: addss %xmm1, %xmm0
+; CHECK-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movmskps %xmm0, %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: retq
entry:
%__x.addr.i = alloca float, align 4
%__u.i = alloca %union.anon, align 4
@@ -71,21 +90,21 @@ entry:
%__f.i = getelementptr inbounds %union.anon, %union.anon* %__u.i, i64 0, i32 0
store float %add, float* %__f.i, align 4
%2 = bitcast float %add to i32
-; CHECK-NOT: shr
-; CHECK: movmskps
-; CHECK-NEXT: and
%shr.i = lshr i32 %2, 31
ret i32 %shr.i
}
; PR11570
-define void @float_call_signbit(double %n) {
-entry:
; FIXME: This should also use movmskps; we don't form the FGETSIGN node
; in this case, though.
+define void @float_call_signbit(double %n) {
; CHECK-LABEL: float_call_signbit:
-; CHECK: movd %xmm0, %rdi
-; FIXME
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: movd %xmm0, %rdi
+; CHECK-NEXT: shrq $63, %rdi
+; CHECK-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<kill>
+; CHECK-NEXT: jmp _float_call_signbit_callee ## TAILCALL
+entry:
%t0 = bitcast double %n to i64
%tobool.i.i.i.i = icmp slt i64 %t0, 0
tail call void @float_call_signbit_callee(i1 zeroext %tobool.i.i.i.i)
@@ -98,10 +117,12 @@ declare void @float_call_signbit_callee(i1 zeroext)
; movmskp{s|d} only set low 4/2 bits, high bits are known zero
define i32 @t1(<4 x float> %x, i32* nocapture %indexTable) nounwind uwtable readonly ssp {
-entry:
; CHECK-LABEL: t1:
-; CHECK: movmskps
-; CHECK-NOT: movslq
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: movmskps %xmm0, %eax
+; CHECK-NEXT: movl (%rdi,%rax,4), %eax
+; CHECK-NEXT: retq
+entry:
%0 = tail call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %x) nounwind
%idxprom = sext i32 %0 to i64
%arrayidx = getelementptr inbounds i32, i32* %indexTable, i64 %idxprom
@@ -110,10 +131,12 @@ entry:
}
define i32 @t2(<4 x float> %x, i32* nocapture %indexTable) nounwind uwtable readonly ssp {
-entry:
; CHECK-LABEL: t2:
-; CHECK: movmskpd
-; CHECK-NOT: movslq
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: movmskpd %xmm0, %eax
+; CHECK-NEXT: movl (%rdi,%rax,4), %eax
+; CHECK-NEXT: retq
+entry:
%0 = bitcast <4 x float> %x to <2 x double>
%1 = tail call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %0) nounwind
%idxprom = sext i32 %1 to i64
diff --git a/test/CodeGen/X86/movpc32-check.ll b/test/CodeGen/X86/movpc32-check.ll
index 606af3c898f4..f50613e9c718 100644
--- a/test/CodeGen/X86/movpc32-check.ll
+++ b/test/CodeGen/X86/movpc32-check.ll
@@ -19,11 +19,10 @@ attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-
!llvm.module.flags = !{!7, !8, !9}
!llvm.ident = !{!10}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (http://llvm.org/git/clang.git 3490ab8630d5643f71f1f04e46984f05b27b8d67) (http://llvm.org/git/llvm.git d2643e2ff955ed234944fe3c6b4ffc1250085843)", isOptimized: false, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (http://llvm.org/git/clang.git 3490ab8630d5643f71f1f04e46984f05b27b8d67) (http://llvm.org/git/llvm.git d2643e2ff955ed234944fe3c6b4ffc1250085843)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "test.c", directory: "movpc-test")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: false, variables: !2)
+!4 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: false, unit: !0, variables: !2)
!5 = !DISubroutineType(types: !6)
!6 = !{null}
!7 = !{i32 2, !"Dwarf Version", i32 4}
diff --git a/test/CodeGen/X86/movtopush.ll b/test/CodeGen/X86/movtopush.ll
index de4c87cf30ad..d715ccfa8c69 100644
--- a/test/CodeGen/X86/movtopush.ll
+++ b/test/CodeGen/X86/movtopush.ll
@@ -1,6 +1,8 @@
; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=i686-windows -no-x86-call-frame-opt | FileCheck %s -check-prefix=NOPUSH
; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=X64
; RUN: llc < %s -mtriple=i686-windows -stackrealign -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED
+; RUN: llc < %s -mtriple=i686-pc-linux | FileCheck %s -check-prefix=LINUX
%class.Class = type { i32 }
%struct.s = type { i64 }
@@ -11,26 +13,14 @@ declare x86_thiscallcc void @thiscall(%class.Class* %class, i32 %a, i32 %b, i32
declare void @oneparam(i32 %a)
declare void @eightparams(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h)
declare void @struct(%struct.s* byval %a, i32 %b, i32 %c, i32 %d)
+declare void @inalloca(<{ %struct.s }>* inalloca)
-; Here, we should have a reserved frame, so we don't expect pushes
-; NORMAL-LABEL: test1:
-; NORMAL: subl $16, %esp
-; NORMAL-NEXT: movl $4, 12(%esp)
-; NORMAL-NEXT: movl $3, 8(%esp)
-; NORMAL-NEXT: movl $2, 4(%esp)
-; NORMAL-NEXT: movl $1, (%esp)
-; NORMAL-NEXT: call
-; NORMAL-NEXT: addl $16, %esp
-define void @test1() {
-entry:
- call void @good(i32 1, i32 2, i32 3, i32 4)
- ret void
-}
+declare i8* @llvm.stacksave()
+declare void @llvm.stackrestore(i8*)
-; We're optimizing for code size, so we should get pushes for x86,
-; even though there is a reserved call frame.
-; Make sure we don't touch x86-64
-; NORMAL-LABEL: test1b:
+; We should get pushes for x86, even though there is a reserved call frame.
+; Make sure we don't touch x86-64, and that turning it off works.
+; NORMAL-LABEL: test1:
; NORMAL-NOT: subl {{.*}} %esp
; NORMAL: pushl $4
; NORMAL-NEXT: pushl $3
@@ -38,28 +28,21 @@ entry:
; NORMAL-NEXT: pushl $1
; NORMAL-NEXT: call
; NORMAL-NEXT: addl $16, %esp
-; X64-LABEL: test1b:
+; X64-LABEL: test1:
; X64: movl $1, %ecx
; X64-NEXT: movl $2, %edx
; X64-NEXT: movl $3, %r8d
; X64-NEXT: movl $4, %r9d
; X64-NEXT: callq good
-define void @test1b() optsize {
-entry:
- call void @good(i32 1, i32 2, i32 3, i32 4)
- ret void
-}
-
-; Same as above, but for minsize
-; NORMAL-LABEL: test1c:
-; NORMAL-NOT: subl {{.*}} %esp
-; NORMAL: pushl $4
-; NORMAL-NEXT: pushl $3
-; NORMAL-NEXT: pushl $2
-; NORMAL-NEXT: pushl $1
-; NORMAL-NEXT: call
-; NORMAL-NEXT: addl $16, %esp
-define void @test1c() minsize {
+; NOPUSH-LABEL: test1:
+; NOPUSH: subl $16, %esp
+; NOPUSH-NEXT: movl $4, 12(%esp)
+; NOPUSH-NEXT: movl $3, 8(%esp)
+; NOPUSH-NEXT: movl $2, 4(%esp)
+; NOPUSH-NEXT: movl $1, (%esp)
+; NOPUSH-NEXT: call
+; NOPUSH-NEXT: addl $16, %esp
+define void @test1() {
entry:
call void @good(i32 1, i32 2, i32 3, i32 4)
ret void
@@ -245,8 +228,7 @@ entry:
; NORMAL-NEXT: pushl $2
; NORMAL-NEXT: pushl $1
; NORMAL-NEXT: call
-; NORMAL-NEXT: addl $16, %esp
-; NORMAL-NEXT: subl $20, %esp
+; NORMAL-NEXT: subl $4, %esp
; NORMAL-NEXT: movl 20(%esp), [[E1:%e..]]
; NORMAL-NEXT: movl 24(%esp), [[E2:%e..]]
; NORMAL-NEXT: movl [[E2]], 4(%esp)
@@ -283,7 +265,7 @@ entry:
; NORMAL-NEXT: pushl $2
; NORMAL-NEXT: pushl $1
; NORMAL-NEXT: calll *16(%esp)
-; NORMAL-NEXT: addl $16, %esp
+; NORMAL-NEXT: addl $24, %esp
define void @test10() optsize {
%stack_fptr = alloca void (i32, i32, i32, i32)*
store void (i32, i32, i32, i32)* @good, void (i32, i32, i32, i32)** %stack_fptr
@@ -336,8 +318,7 @@ entry:
; NORMAL-NEXT: pushl $2
; NORMAL-NEXT: pushl $1
; NORMAL-NEXT: calll _good
-; NORMAL-NEXT: addl $16, %esp
-; NORMAL-NEXT: subl $20, %esp
+; NORMAL-NEXT: subl $4, %esp
; NORMAL: movl $8, 16(%esp)
; NORMAL-NEXT: movl $7, 12(%esp)
; NORMAL-NEXT: movl $6, 8(%esp)
@@ -380,3 +361,54 @@ entry:
call void @good(i32 %val1, i32 %val2, i32 %val3, i32 %add)
ret i32* %ptr3
}
+
+; Make sure to fold adjacent stack adjustments.
+; LINUX-LABEL: pr27140:
+; LINUX: subl $12, %esp
+; LINUX: .cfi_def_cfa_offset 16
+; LINUX-NOT: sub
+; LINUX: pushl $4
+; LINUX: .cfi_adjust_cfa_offset 4
+; LINUX: pushl $3
+; LINUX: .cfi_adjust_cfa_offset 4
+; LINUX: pushl $2
+; LINUX: .cfi_adjust_cfa_offset 4
+; LINUX: pushl $1
+; LINUX: .cfi_adjust_cfa_offset 4
+; LINUX: calll good
+; LINUX: addl $28, %esp
+; LINUX: .cfi_adjust_cfa_offset -16
+; LINUX-NOT: add
+; LINUX: retl
+define void @pr27140() optsize {
+entry:
+ tail call void @good(i32 1, i32 2, i32 3, i32 4)
+ ret void
+}
+
+; Check that a stack restore (leal -4(%ebp), %esp) doesn't get merged with a
+; stack adjustment (addl $12, %esp). Just because it's a lea doesn't mean it's
+; simply decreasing the stack pointer.
+; NORMAL-LABEL: test14:
+; NORMAL: calll _B_func
+; NORMAL: leal -4(%ebp), %esp
+; NORMAL-NOT: %esp
+; NORMAL: retl
+%struct.A = type { i32, i32 }
+%struct.B = type { i8 }
+declare x86_thiscallcc %struct.B* @B_ctor(%struct.B* returned, %struct.A* byval)
+declare void @B_func(%struct.B* sret, %struct.B*, i32)
+define void @test14(%struct.A* %a) {
+entry:
+ %ref.tmp = alloca %struct.B, align 1
+ %agg.tmp = alloca i64, align 4
+ %tmpcast = bitcast i64* %agg.tmp to %struct.A*
+ %tmp = alloca %struct.B, align 1
+ %0 = bitcast %struct.A* %a to i64*
+ %1 = load i64, i64* %0, align 4
+ store i64 %1, i64* %agg.tmp, align 4
+ %call = call x86_thiscallcc %struct.B* @B_ctor(%struct.B* %ref.tmp, %struct.A* byval %tmpcast)
+ %2 = getelementptr inbounds %struct.B, %struct.B* %tmp, i32 0, i32 0
+ call void @B_func(%struct.B* sret %tmp, %struct.B* %ref.tmp, i32 1)
+ ret void
+}
diff --git a/test/CodeGen/X86/movtopush64.ll b/test/CodeGen/X86/movtopush64.ll
new file mode 100644
index 000000000000..1f4aa18c3227
--- /dev/null
+++ b/test/CodeGen/X86/movtopush64.ll
@@ -0,0 +1,193 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=NORMAL -check-prefix=NORMALFP
+; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=NOPUSH
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -check-prefix=NOPUSH -check-prefix=NORMALFP
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -no-x86-call-frame-opt | FileCheck %s -check-prefix=NOPUSH
+
+declare void @seven_params(i32 %a, i64 %b, i32 %c, i64 %d, i32 %e, i64 %f, i32 %g)
+declare void @ten_params(i32 %a, i64 %b, i32 %c, i64 %d, i32 %e, i64 %f, i32 %g, i64 %h, i32 %i, i64 %j)
+declare void @ten_params_ptr(i32 %a, i64 %b, i32 %c, i64 %d, i32 %e, i64 %f, i32 %g, i8* %h, i32 %i, i64 %j)
+declare void @cannot_push(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i)
+
+; We should get pushes for the last 4 parameters. Test that the
+; in-register parameters are all in the right places, and check
+; that the stack manipulations are correct and correctly
+; described by the DWARF directives. Test that the switch
+; to disable the optimization works and that the optimization
+; doesn't kick in on Windows64 where it is not allowed.
+; NORMAL-LABEL: test1
+; NORMAL: pushq
+; NORMAL-DAG: movl $1, %edi
+; NORMAL-DAG: movl $2, %esi
+; NORMAL-DAG: movl $3, %edx
+; NORMAL-DAG: movl $4, %ecx
+; NORMAL-DAG: movl $5, %r8d
+; NORMAL-DAG: movl $6, %r9d
+; NORMAL: pushq $10
+; NORMAL: .cfi_adjust_cfa_offset 8
+; NORMAL: pushq $9
+; NORMAL: .cfi_adjust_cfa_offset 8
+; NORMAL: pushq $8
+; NORMAL: .cfi_adjust_cfa_offset 8
+; NORMAL: pushq $7
+; NORMAL: .cfi_adjust_cfa_offset 8
+; NORMAL: callq ten_params
+; NORMAL: addq $32, %rsp
+; NORMAL: .cfi_adjust_cfa_offset -32
+; NORMAL: popq
+; NORMAL: retq
+; NOPUSH-LABEL: test1
+; NOPUSH-NOT: pushq
+; NOPUSH: retq
+define void @test1() {
+entry:
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 8, i32 9, i64 10)
+ ret void
+}
+
+; The presence of a frame pointer should not prevent pushes. But we
+; don't need the CFI directives in that case.
+; Also check that we generate the right pushes for >8bit immediates.
+; NORMALFP-LABEL: test2
+; NORMALFP: pushq $10000
+; NORMALFP-NEXT: pushq $9000
+; NORMALFP-NEXT: pushq $8000
+; NORMALFP-NEXT: pushq $7000
+; NORMALFP-NEXT: callq {{_?}}ten_params
+define void @test2(i32 %k) {
+entry:
+ %a = alloca i32, i32 %k
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7000, i64 8000, i32 9000, i64 10000)
+ ret void
+}
+
+; Parameters 7 & 8 should push a 64-bit register.
+; TODO: Note that the regular expressions disallow r8 and r9. That's fine for
+; now, because the pushes will always follow the moves into r8 and r9.
+; Eventually, though, we want to be able to schedule the pushes better.
+; In this example, it will save two copies, because we have to move the
+; incoming parameters out of %rdi and %rsi to make room for the outgoing
+; parameters.
+; NORMAL-LABEL: test3
+; NORMAL: pushq $10000
+; NORMAL: pushq $9000
+; NORMAL: pushq %r{{..}}
+; NORMAL: pushq %r{{..}}
+; NORMAL: callq ten_params
+define void @test3(i32 %a, i64 %b) {
+entry:
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 %a, i64 %b, i32 9000, i64 10000)
+ ret void
+}
+
+; Check that we avoid the optimization for just one push.
+; NORMAL-LABEL: test4
+; NORMAL: movl $7, (%rsp)
+; NORMAL: callq seven_params
+define void @test4() {
+entry:
+ call void @seven_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7)
+ ret void
+}
+
+; Check that pushing link-time constant addresses works correctly
+; NORMAL-LABEL: test5
+; NORMAL: pushq $10
+; NORMAL: pushq $9
+; NORMAL: pushq $ext
+; NORMAL: pushq $7
+; NORMAL: callq ten_params_ptr
+@ext = external constant i8
+define void @test5() {
+entry:
+ call void @ten_params_ptr(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i8* @ext, i32 9, i64 10)
+ ret void
+}
+
+; Check that we fuse 64-bit loads but not 32-bit loads into PUSH mem.
+; NORMAL-LABEL: test6
+; NORMAL: movq %rsi, [[REG64:%.+]]
+; NORMAL: pushq $10
+; NORMAL: pushq $9
+; NORMAL: pushq ([[REG64]])
+; NORMAL: pushq {{%r..}}
+; NORMAL: callq ten_params
+define void @test6(i32* %p32, i64* %p64) {
+entry:
+ %v32 = load i32, i32* %p32
+ %v64 = load i64, i64* %p64
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 %v32, i64 %v64, i32 9, i64 10)
+ ret void
+}
+
+; Fold stack-relative loads into the push with correct offsets.
+; Do the same for an indirect call whose address is loaded from the stack.
+; On entry, %p7 is at 8(%rsp) and %p8 is at 16(%rsp). Prior to the call
+; sequence, 72 bytes are allocated to the stack, 48 for register saves and
+; 24 for local storage and alignment, so %p7 is at 80(%rsp) and %p8 is at
+; 88(%rsp). The call address can be stored anywhere in the local space but
+; happens to be stored at 8(%rsp). Each push bumps these offsets up by
+; 8 bytes.
+; NORMAL-LABEL: test7
+; NORMAL: movq %r{{.*}}, 8(%rsp) {{.*Spill$}}
+; NORMAL: pushq 88(%rsp)
+; NORMAL: pushq $9
+; NORMAL: pushq 96(%rsp)
+; NORMAL: pushq $7
+; NORMAL: callq *40(%rsp)
+define void @test7(i64 %p1, i64 %p2, i64 %p3, i64 %p4, i64 %p5, i64 %p6, i64 %p7, i64 %p8) {
+entry:
+ %stack_fptr = alloca void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)*
+ store void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)* @ten_params, void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)** %stack_fptr
+ %ten_params_ptr = load volatile void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)*, void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64)** %stack_fptr
+ call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+ call void (i32, i64, i32, i64, i32, i64, i32, i64, i32, i64) %ten_params_ptr(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 %p7, i32 9, i64 %p8)
+ ret void
+}
+
+; We can't fold the load from the global into the push because of
+; interference from the store
+; NORMAL-LABEL: test8
+; NORMAL: movq the_global(%rip), [[REG:%r.+]]
+; NORMAL: movq $42, the_global
+; NORMAL: pushq $10
+; NORMAL: pushq $9
+; NORMAL: pushq [[REG]]
+; NORMAL: pushq $7
+; NORMAL: callq ten_params
+@the_global = external global i64
+define void @test8() {
+ %myload = load i64, i64* @the_global
+ store i64 42, i64* @the_global
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 %myload, i32 9, i64 10)
+ ret void
+}
+
+
+; Converting one function call to use pushes negatively affects
+; other calls that pass arguments on the stack without pushes.
+; If the cost outweighs the benefit, avoid using pushes.
+; NORMAL-LABEL: test9
+; NORMAL: callq cannot_push
+; NORMAL-NOT: push
+; NORMAL: callq ten_params
+define void @test9(float %p1) {
+ call void @cannot_push(float 1.0e0, float 2.0e0, float 3.0e0, float 4.0e0, float 5.0e0, float 6.0e0, float 7.0e0, float 8.0e0, float %p1)
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 8, i32 9, i64 10)
+ call void @cannot_push(float 1.0e0, float 2.0e0, float 3.0e0, float 4.0e0, float 5.0e0, float 6.0e0, float 7.0e0, float 8.0e0, float %p1)
+ ret void
+}
+
+; But if the benefit outweighs the cost, use pushes.
+; NORMAL-LABEL: test10
+; NORMAL: callq cannot_push
+; NORMAL: pushq $10
+; NORMAL: pushq $9
+; NORMAL: pushq $8
+; NORMAL: pushq $7
+; NORMAL: callq ten_params
+define void @test10(float %p1) {
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 8, i32 9, i64 10)
+ call void @cannot_push(float 1.0e0, float 2.0e0, float 3.0e0, float 4.0e0, float 5.0e0, float 6.0e0, float 7.0e0, float 8.0e0, float %p1)
+ call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 8, i32 9, i64 10)
+ ret void
+}
diff --git a/test/CodeGen/X86/mul-i256.ll b/test/CodeGen/X86/mul-i256.ll
new file mode 100644
index 000000000000..8b8b10aa1790
--- /dev/null
+++ b/test/CodeGen/X86/mul-i256.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test(i256* %a, i256* %b, i256* %out) #0 {
+entry:
+ %av = load i256, i256* %a
+ %bv = load i256, i256* %b
+ %r = mul i256 %av, %bv
+ store i256 %r, i256* %out
+ ret void
+}
+
+; CHECK-LABEL: @test
+; There is a lot of inter-register motion, and so matching the instruction
+; sequence will be fragile. There should be 6 underlying multiplications.
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK-NOT: imulq
+; CHECK: retq
+
+attributes #0 = { norecurse nounwind uwtable "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }
+
diff --git a/test/CodeGen/X86/mul128.ll b/test/CodeGen/X86/mul128.ll
index 6825b99f2425..2b3a13509b3c 100644
--- a/test/CodeGen/X86/mul128.ll
+++ b/test/CodeGen/X86/mul128.ll
@@ -1,6 +1,17 @@
-; RUN: llc < %s -march=x86-64 | grep mul | count 3
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
define i128 @foo(i128 %t, i128 %u) {
+; X64-LABEL: foo:
+; X64: # BB#0:
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: imulq %rdi, %rcx
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: addq %rcx, %rdx
+; X64-NEXT: imulq %r8, %rsi
+; X64-NEXT: addq %rsi, %rdx
+; X64-NEXT: retq
%k = mul i128 %t, %u
ret i128 %k
}
diff --git a/test/CodeGen/X86/mul64.ll b/test/CodeGen/X86/mul64.ll
index 5a25c5d0e9de..f5ca52a93b51 100644
--- a/test/CodeGen/X86/mul64.ll
+++ b/test/CodeGen/X86/mul64.ll
@@ -1,6 +1,27 @@
-; RUN: llc < %s -march=x86 | grep mul | count 3
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
-define i64 @foo(i64 %t, i64 %u) {
+define i64 @foo(i64 %t, i64 %u) nounwind {
+; X32-LABEL: foo:
+; X32: # BB#0:
+; X32-NEXT: pushl %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: imull {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: imull {{[0-9]+}}(%esp), %esi
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: popl %esi
+; X32-NEXT: retl
+;
+; X64-LABEL: foo:
+; X64: # BB#0:
+; X64-NEXT: imulq %rsi, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
%k = mul i64 %t, %u
ret i64 %k
}
diff --git a/test/CodeGen/X86/musttail-varargs.ll b/test/CodeGen/X86/musttail-varargs.ll
index 247d78776b80..b3f73aaf890b 100644
--- a/test/CodeGen/X86/musttail-varargs.ll
+++ b/test/CodeGen/X86/musttail-varargs.ll
@@ -1,7 +1,8 @@
-; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX
-; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-linux-gnux32 | FileCheck %s --check-prefix=LINUX-X32
-; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS
-; RUN: llc < %s -enable-tail-merge=0 -mtriple=i686-windows | FileCheck %s --check-prefix=X86
+; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX
+; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-linux-gnux32 | FileCheck %s --check-prefix=LINUX-X32
+; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS
+; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=i686-windows | FileCheck %s --check-prefix=X86
+; RUN: llc -verify-machineinstrs < %s -enable-tail-merge=0 -mtriple=i686-windows -mattr=+sse2 | FileCheck %s --check-prefix=X86
; Test that we actually spill and reload all arguments in the variadic argument
; pack. Doing a normal call will clobber all argument registers, and we will
@@ -136,6 +137,8 @@ define void @g_thunk(i8* %fptr_i8, ...) {
; WINDOWS: jmpq *%rcx # TAILCALL
; X86-LABEL: _g_thunk:
+; X86-NOT: push %ebp
+; X86-NOT: andl {{.*}}, %esp
; X86: jmpl *%eax # TAILCALL
; Do a simple multi-exit multi-bb test.
diff --git a/test/CodeGen/X86/mwaitx.ll b/test/CodeGen/X86/mwaitx.ll
new file mode 100644
index 000000000000..5bf64311282f
--- /dev/null
+++ b/test/CodeGen/X86/mwaitx.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+mwaitx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+mwaitx | FileCheck %s -check-prefix=WIN64
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=bdver4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=bdver4 | FileCheck %s -check-prefix=WIN64
+
+; CHECK-LABEL: foo:
+; CHECK: leaq (%rdi), %rax
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: monitorx
+; WIN64-LABEL: foo:
+; WIN64: leaq (%rcx), %rax
+; WIN64-NEXT: movl %edx, %ecx
+; WIN64-NEXT: movl %r8d, %edx
+; WIN64-NEXT: monitorx
+define void @foo(i8* %P, i32 %E, i32 %H) nounwind {
+entry:
+ tail call void @llvm.x86.monitorx(i8* %P, i32 %E, i32 %H)
+ ret void
+}
+
+declare void @llvm.x86.monitorx(i8*, i32, i32) nounwind
+
+; CHECK-LABEL: bar:
+; CHECK: movl %edi, %ecx
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: movl %edx, %ebx
+; CHECK-NEXT: mwaitx
+; WIN64-LABEL: bar:
+; WIN64: movl %edx, %eax
+; WIN64: movl %r8d, %ebx
+; WIN64-NEXT: mwaitx
+define void @bar(i32 %E, i32 %H, i32 %C) nounwind {
+entry:
+ tail call void @llvm.x86.mwaitx(i32 %E, i32 %H, i32 %C)
+ ret void
+}
+
+declare void @llvm.x86.mwaitx(i32, i32, i32) nounwind
diff --git a/test/CodeGen/X86/negate-add-zero.ll b/test/CodeGen/X86/negate-add-zero.ll
index 06341dc7ba53..5911312053dd 100644
--- a/test/CodeGen/X86/negate-add-zero.ll
+++ b/test/CodeGen/X86/negate-add-zero.ll
@@ -1133,4 +1133,4 @@ declare %"struct.std::basic_ostream<char,std::char_traits<char> >"* @_ZlsIdLi5EL
declare %"struct.std::basic_ostream<char,std::char_traits<char> >"* @_ZlsIdLi5ELi6EERSoS0_RK15FixedMatrixBaseIT_XT0_EXT1_EE(%"struct.std::basic_ostream<char,std::char_traits<char> >"*, %"struct.FixedMatrixBase<double,5,6>"*)
-declare void @llvm.memset.i64(i8* nocapture, i8, i64, i32) nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
diff --git a/test/CodeGen/X86/negative-offset.ll b/test/CodeGen/X86/negative-offset.ll
new file mode 100644
index 000000000000..dc1b255d0202
--- /dev/null
+++ b/test/CodeGen/X86/negative-offset.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -asm-verbose=false | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Test that a constant consisting of a global symbol with a negative offset
+; is properly folded and isel'd.
+
+; CHECK-LABEL: negative_offset:
+; CHECK: movl $G, %eax
+; CHECK: notq %rax
+; CHECK: addq %rdi, %rax
+; CHECK: retq
+@G = external global [8 x i32]
+define i8* @negative_offset(i8* %a) {
+ %t = getelementptr i8, i8* %a, i64 sub (i64 -1, i64 ptrtoint ([8 x i32]* @G to i64))
+ ret i8* %t
+}
diff --git a/test/CodeGen/X86/new-remat.ll b/test/CodeGen/X86/new-remat.ll
new file mode 100644
index 000000000000..726ad2d0a127
--- /dev/null
+++ b/test/CodeGen/X86/new-remat.ll
@@ -0,0 +1,70 @@
+; RUN: llc -verify-regalloc < %s | FileCheck %s
+; Check all spills are rematerialized.
+; CHECK-NOT: Spill
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common global double 0.000000e+00, align 8
+@a = common global i32 0, align 4
+
+; Function Attrs: nounwind uwtable
+define i32 @uniform_testdata(i32 %p1) {
+entry:
+ %cmp3 = icmp sgt i32 %p1, 0
+ br i1 %cmp3, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ %tmp = add i32 %p1, -1
+ %xtraiter = and i32 %p1, 7
+ %lcmp.mod = icmp eq i32 %xtraiter, 0
+ br i1 %lcmp.mod, label %for.body.preheader.split, label %for.body.prol.preheader
+
+for.body.prol.preheader: ; preds = %for.body.preheader
+ br label %for.body.prol
+
+for.body.prol: ; preds = %for.body.prol, %for.body.prol.preheader
+ %i.04.prol = phi i32 [ %inc.prol, %for.body.prol ], [ 0, %for.body.prol.preheader ]
+ %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.prol.preheader ]
+ %tmp1 = load double, double* @b, align 8
+ %call.prol = tail call double @pow(double %tmp1, double 2.500000e-01)
+ %inc.prol = add nuw nsw i32 %i.04.prol, 1
+ %prol.iter.sub = add i32 %prol.iter, -1
+ %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
+ br i1 %prol.iter.cmp, label %for.body.preheader.split.loopexit, label %for.body.prol
+
+for.body.preheader.split.loopexit: ; preds = %for.body.prol
+ %inc.prol.lcssa = phi i32 [ %inc.prol, %for.body.prol ]
+ br label %for.body.preheader.split
+
+for.body.preheader.split: ; preds = %for.body.preheader.split.loopexit, %for.body.preheader
+ %i.04.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.prol.lcssa, %for.body.preheader.split.loopexit ]
+ %tmp2 = icmp ult i32 %tmp, 7
+ br i1 %tmp2, label %for.end.loopexit, label %for.body.preheader.split.split
+
+for.body.preheader.split.split: ; preds = %for.body.preheader.split
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.preheader.split.split
+ %i.04 = phi i32 [ %i.04.unr, %for.body.preheader.split.split ], [ %inc.7, %for.body ]
+ %tmp3 = load double, double* @b, align 8
+ %call = tail call double @pow(double %tmp3, double 2.500000e-01)
+ %tmp4 = load double, double* @b, align 8
+ %call.1 = tail call double @pow(double %tmp4, double 2.500000e-01)
+ %inc.7 = add nsw i32 %i.04, 8
+ %exitcond.7 = icmp eq i32 %inc.7, %p1
+ br i1 %exitcond.7, label %for.end.loopexit.unr-lcssa, label %for.body
+
+for.end.loopexit.unr-lcssa: ; preds = %for.body
+ br label %for.end.loopexit
+
+for.end.loopexit: ; preds = %for.end.loopexit.unr-lcssa, %for.body.preheader.split
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %tmp5 = load i32, i32* @a, align 4
+ ret i32 %tmp5
+}
+
+; Function Attrs: nounwind
+declare double @pow(double, double)
diff --git a/test/CodeGen/X86/no-prolog-kill.ll b/test/CodeGen/X86/no-prolog-kill.ll
new file mode 100644
index 000000000000..f625f315bb7c
--- /dev/null
+++ b/test/CodeGen/X86/no-prolog-kill.ll
@@ -0,0 +1,21 @@
+; RUN: llc -verify-machineinstrs -o - %s | FileCheck %s
+target triple = "x86_64--"
+
+; This function gets an AL live-in and at the same time saves+restores RAX. We must
+; not add a kill flag to the "PUSHQ %rax" or the machine verifier will complain.
+; CHECK-LABEL: test:
+; CHECK: pushq %rax
+; CHECK: testb %al, %al
+; CHECK: je .LBB
+define void @test(i64 %a, i8* %b, ...) {
+entry:
+ %bar = alloca i8
+ call void @llvm.va_start(i8* %bar)
+ call void @llvm.eh.unwind.init()
+ call void @llvm.eh.return.i64(i64 %a, i8* %b)
+ unreachable
+}
+
+declare void @llvm.eh.return.i64(i64, i8*)
+declare void @llvm.eh.unwind.init()
+declare void @llvm.va_start(i8*)
diff --git a/test/CodeGen/X86/no-sse2-avg.ll b/test/CodeGen/X86/no-sse2-avg.ll
new file mode 100644
index 000000000000..0ed0a7f74cb3
--- /dev/null
+++ b/test/CodeGen/X86/no-sse2-avg.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2 | FileCheck %s
+
+define <16 x i8> @PR27973() {
+; CHECK-LABEL: PR27973:
+; CHECK: # BB#0:
+; CHECK-NEXT: movb $0, 15(%rdi)
+; CHECK-NEXT: movb $0, 14(%rdi)
+; CHECK-NEXT: movb $0, 13(%rdi)
+; CHECK-NEXT: movb $0, 12(%rdi)
+; CHECK-NEXT: movb $0, 11(%rdi)
+; CHECK-NEXT: movb $0, 10(%rdi)
+; CHECK-NEXT: movb $0, 9(%rdi)
+; CHECK-NEXT: movb $0, 8(%rdi)
+; CHECK-NEXT: movb $0, 7(%rdi)
+; CHECK-NEXT: movb $0, 6(%rdi)
+; CHECK-NEXT: movb $0, 5(%rdi)
+; CHECK-NEXT: movb $0, 4(%rdi)
+; CHECK-NEXT: movb $0, 3(%rdi)
+; CHECK-NEXT: movb $0, 2(%rdi)
+; CHECK-NEXT: movb $0, 1(%rdi)
+; CHECK-NEXT: movb $0, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+;
+ %t0 = zext <16 x i8> zeroinitializer to <16 x i32>
+ %t1 = add nuw nsw <16 x i32> %t0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %t2 = lshr <16 x i32> %t1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %t3 = trunc <16 x i32> %t2 to <16 x i8>
+ ret <16 x i8> %t3
+}
diff --git a/test/CodeGen/X86/nontemporal-2.ll b/test/CodeGen/X86/nontemporal-2.ll
index c9767f88488c..e221f8e9520b 100644
--- a/test/CodeGen/X86/nontemporal-2.ll
+++ b/test/CodeGen/X86/nontemporal-2.ll
@@ -1,54 +1,235 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=SSE --check-prefix=SSE4A
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=VLX
; Make sure that we generate non-temporal stores for the test cases below.
; We use xorps for zeroing, so domain information isn't available anymore.
+; Scalar versions (zeroing means we can do this even for fp types).
+
+define void @test_zero_f32(float* %dst) {
+; SSE-LABEL: test_zero_f32:
+; SSE: # BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntil %eax, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_f32:
+; AVX: # BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntil %eax, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_f32:
+; VLX: # BB#0:
+; VLX-NEXT: xorl %eax, %eax
+; VLX-NEXT: movntil %eax, (%rdi)
+; VLX-NEXT: retq
+ store float zeroinitializer, float* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+define void @test_zero_i32(i32* %dst) {
+; SSE-LABEL: test_zero_i32:
+; SSE: # BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntil %eax, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_i32:
+; AVX: # BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntil %eax, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_i32:
+; VLX: # BB#0:
+; VLX-NEXT: xorl %eax, %eax
+; VLX-NEXT: movntil %eax, (%rdi)
+; VLX-NEXT: retq
+ store i32 zeroinitializer, i32* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+define void @test_zero_f64(double* %dst) {
+; SSE-LABEL: test_zero_f64:
+; SSE: # BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_f64:
+; AVX: # BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_f64:
+; VLX: # BB#0:
+; VLX-NEXT: xorl %eax, %eax
+; VLX-NEXT: movntiq %rax, (%rdi)
+; VLX-NEXT: retq
+ store double zeroinitializer, double* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+define void @test_zero_i64(i64* %dst) {
+; SSE-LABEL: test_zero_i64:
+; SSE: # BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_i64:
+; AVX: # BB#0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_i64:
+; VLX: # BB#0:
+; VLX-NEXT: xorl %eax, %eax
+; VLX-NEXT: movntiq %rax, (%rdi)
+; VLX-NEXT: retq
+ store i64 zeroinitializer, i64* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+; And now XMM versions.
+
define void @test_zero_v4f32(<4 x float>* %dst) {
-; CHECK-LABEL: test_zero_v4f32:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_zero_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v4f32:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_zero_v4i32(<4 x i32>* %dst) {
-; CHECK-LABEL: test_zero_v4i32:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_zero_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v4i32:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
+ store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_zero_v2f64(<2 x double>* %dst) {
-; CHECK-LABEL: test_zero_v2f64:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_zero_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v2f64:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_zero_v2i64(<2 x i64>* %dst) {
-; CHECK-LABEL: test_zero_v2i64:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_zero_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v2i64:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_zero_v8i16(<8 x i16>* %dst) {
-; CHECK-LABEL: test_zero_v8i16:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_zero_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v8i16:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_zero_v16i8(<16 x i8>* %dst) {
-; CHECK-LABEL: test_zero_v16i8:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_zero_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v16i8:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 16, !nontemporal !1
ret void
}
@@ -56,43 +237,145 @@ define void @test_zero_v16i8(<16 x i8>* %dst) {
; And now YMM versions.
define void @test_zero_v8f32(<8 x float>* %dst) {
-; CHECK-LABEL: test_zero_v8f32:
-; AVX: vmovntps %ymm
+; SSE-LABEL: test_zero_v8f32:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v8f32:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <8 x float> zeroinitializer, <8 x float>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_zero_v8i32(<8 x i32>* %dst) {
-; CHECK-LABEL: test_zero_v8i32:
-; AVX2: vmovntps %ymm
+; SSE-LABEL: test_zero_v8i32:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v8i32:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v8i32:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_zero_v4f64(<4 x double>* %dst) {
-; CHECK-LABEL: test_zero_v4f64:
-; AVX: vmovntps %ymm
+; SSE-LABEL: test_zero_v4f64:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v4f64:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <4 x double> zeroinitializer, <4 x double>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_zero_v4i64(<4 x i64>* %dst) {
-; CHECK-LABEL: test_zero_v4i64:
-; AVX2: vmovntps %ymm
+; SSE-LABEL: test_zero_v4i64:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v4i64:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v4i64:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_zero_v16i16(<16 x i16>* %dst) {
-; CHECK-LABEL: test_zero_v16i16:
-; AVX2: vmovntps %ymm
+; SSE-LABEL: test_zero_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v16i16:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v16i16:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_zero_v32i8(<32 x i8>* %dst) {
-; CHECK-LABEL: test_zero_v32i8:
-; AVX2: vmovntps %ymm
+; SSE-LABEL: test_zero_v32i8:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movntps %xmm0, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_zero_v32i8:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_zero_v32i8:
+; VLX: # BB#0:
+; VLX-NEXT: vpxord %ymm0, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 32, !nontemporal !1
ret void
}
@@ -100,50 +383,358 @@ define void @test_zero_v32i8(<32 x i8>* %dst) {
; Check that we also handle arguments. Here the type survives longer.
+; Scalar versions.
+
+define void @test_arg_f32(float %arg, float* %dst) {
+; SSE2-LABEL: test_arg_f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movss %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_arg_f32:
+; SSE4A: # BB#0:
+; SSE4A-NEXT: movntss %xmm0, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_arg_f32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movss %xmm0, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_arg_f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovss %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_f32:
+; VLX: # BB#0:
+; VLX-NEXT: vmovss %xmm0, (%rdi)
+; VLX-NEXT: retq
+ store float %arg, float* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+define void @test_arg_i32(i32 %arg, i32* %dst) {
+; SSE-LABEL: test_arg_i32:
+; SSE: # BB#0:
+; SSE-NEXT: movntil %edi, (%rsi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_i32:
+; AVX: # BB#0:
+; AVX-NEXT: movntil %edi, (%rsi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_i32:
+; VLX: # BB#0:
+; VLX-NEXT: movntil %edi, (%rsi)
+; VLX-NEXT: retq
+ store i32 %arg, i32* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+define void @test_arg_f64(double %arg, double* %dst) {
+; SSE2-LABEL: test_arg_f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_arg_f64:
+; SSE4A: # BB#0:
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_arg_f64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movsd %xmm0, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_arg_f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_f64:
+; VLX: # BB#0:
+; VLX-NEXT: vmovsd %xmm0, (%rdi)
+; VLX-NEXT: retq
+ store double %arg, double* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+define void @test_arg_i64(i64 %arg, i64* %dst) {
+; SSE-LABEL: test_arg_i64:
+; SSE: # BB#0:
+; SSE-NEXT: movntiq %rdi, (%rsi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_i64:
+; AVX: # BB#0:
+; AVX-NEXT: movntiq %rdi, (%rsi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_i64:
+; VLX: # BB#0:
+; VLX-NEXT: movntiq %rdi, (%rsi)
+; VLX-NEXT: retq
+ store i64 %arg, i64* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+; Extract versions
+
+define void @test_extract_f32(<4 x float> %arg, float* %dst) {
+; SSE2-LABEL: test_extract_f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: movss %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_extract_f32:
+; SSE4A: # BB#0:
+; SSE4A-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE4A-NEXT: movntss %xmm0, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_extract_f32:
+; SSE41: # BB#0:
+; SSE41-NEXT: extractps $1, %xmm0, %eax
+; SSE41-NEXT: movntil %eax, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_extract_f32:
+; AVX: # BB#0:
+; AVX-NEXT: vextractps $1, %xmm0, %eax
+; AVX-NEXT: movntil %eax, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_extract_f32:
+; VLX: # BB#0:
+; VLX-NEXT: vextractps $1, %xmm0, %eax
+; VLX-NEXT: movntil %eax, (%rdi)
+; VLX-NEXT: retq
+ %1 = extractelement <4 x float> %arg, i32 1
+ store float %1, float* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+define void @test_extract_i32(<4 x i32> %arg, i32* %dst) {
+; SSE2-LABEL: test_extract_i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movntil %eax, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_extract_i32:
+; SSE4A: # BB#0:
+; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE4A-NEXT: movd %xmm0, %eax
+; SSE4A-NEXT: movntil %eax, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_extract_i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrd $1, %xmm0, %eax
+; SSE41-NEXT: movntil %eax, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_extract_i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrd $1, %xmm0, %eax
+; AVX-NEXT: movntil %eax, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_extract_i32:
+; VLX: # BB#0:
+; VLX-NEXT: vpextrd $1, %xmm0, %eax
+; VLX-NEXT: movntil %eax, (%rdi)
+; VLX-NEXT: retq
+ %1 = extractelement <4 x i32> %arg, i32 1
+ store i32 %1, i32* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+define void @test_extract_f64(<2 x double> %arg, double* %dst) {
+; SSE2-LABEL: test_extract_f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movhpd %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_extract_f64:
+; SSE4A: # BB#0:
+; SSE4A-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_extract_f64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movhpd %xmm0, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_extract_f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovhpd %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_extract_f64:
+; VLX: # BB#0:
+; VLX-NEXT: vmovhpd %xmm0, (%rdi)
+; VLX-NEXT: retq
+ %1 = extractelement <2 x double> %arg, i32 1
+ store double %1, double* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+define void @test_extract_i64(<2 x i64> %arg, i64* %dst) {
+; SSE2-LABEL: test_extract_i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_extract_i64:
+; SSE4A: # BB#0:
+; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE4A-NEXT: movd %xmm0, %rax
+; SSE4A-NEXT: movntiq %rax, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_extract_i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_extract_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_extract_i64:
+; VLX: # BB#0:
+; VLX-NEXT: vpextrq $1, %xmm0, %rax
+; VLX-NEXT: movntiq %rax, (%rdi)
+; VLX-NEXT: retq
+ %1 = extractelement <2 x i64> %arg, i32 1
+ store i64 %1, i64* %dst, align 1, !nontemporal !1
+ ret void
+}
+
+; And now XMM versions.
+
define void @test_arg_v4f32(<4 x float> %arg, <4 x float>* %dst) {
-; CHECK-LABEL: test_arg_v4f32:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_arg_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v4f32:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
+; VLX-NEXT: retq
store <4 x float> %arg, <4 x float>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %dst) {
-; CHECK-LABEL: test_arg_v4i32:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_arg_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v4i32:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
store <4 x i32> %arg, <4 x i32>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_arg_v2f64(<2 x double> %arg, <2 x double>* %dst) {
-; CHECK-LABEL: test_arg_v2f64:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_arg_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v2f64:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntpd %xmm0, (%rdi)
+; VLX-NEXT: retq
store <2 x double> %arg, <2 x double>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %dst) {
-; CHECK-LABEL: test_arg_v2i64:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_arg_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v2i64:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
store <2 x i64> %arg, <2 x i64>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %dst) {
-; CHECK-LABEL: test_arg_v8i16:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_arg_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v8i16:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
store <8 x i16> %arg, <8 x i16>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %dst) {
-; CHECK-LABEL: test_arg_v16i8:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_arg_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v16i8:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
store <16 x i8> %arg, <16 x i8>* %dst, align 16, !nontemporal !1
ret void
}
@@ -151,43 +742,127 @@ define void @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %dst) {
; And now YMM versions.
define void @test_arg_v8f32(<8 x float> %arg, <8 x float>* %dst) {
-; CHECK-LABEL: test_arg_v8f32:
-; AVX: vmovntps %ymm
+; SSE-LABEL: test_arg_v8f32:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v8f32:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: retq
store <8 x float> %arg, <8 x float>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %dst) {
-; CHECK-LABEL: test_arg_v8i32:
-; AVX2: vmovntps %ymm
+; SSE-LABEL: test_arg_v8i32:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v8i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v8i32:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <8 x i32> %arg, <8 x i32>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_arg_v4f64(<4 x double> %arg, <4 x double>* %dst) {
-; CHECK-LABEL: test_arg_v4f64:
-; AVX: vmovntps %ymm
+; SSE-LABEL: test_arg_v4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v4f64:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntpd %ymm0, (%rdi)
+; VLX-NEXT: retq
store <4 x double> %arg, <4 x double>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %dst) {
-; CHECK-LABEL: test_arg_v4i64:
-; AVX2: vmovntps %ymm
+; SSE-LABEL: test_arg_v4i64:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v4i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v4i64:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <4 x i64> %arg, <4 x i64>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %dst) {
-; CHECK-LABEL: test_arg_v16i16:
-; AVX2: vmovntps %ymm
+; SSE-LABEL: test_arg_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v16i16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v16i16:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <16 x i16> %arg, <16 x i16>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %dst) {
-; CHECK-LABEL: test_arg_v32i8:
-; AVX2: vmovntps %ymm
+; SSE-LABEL: test_arg_v32i8:
+; SSE: # BB#0:
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v32i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_arg_v32i8:
+; VLX: # BB#0:
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
store <32 x i8> %arg, <32 x i8>* %dst, align 32, !nontemporal !1
ret void
}
@@ -197,54 +872,138 @@ define void @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %dst) {
; We use an add to make the type survive all the way to the MOVNT.
define void @test_op_v4f32(<4 x float> %a, <4 x float> %b, <4 x float>* %dst) {
-; CHECK-LABEL: test_op_v4f32:
-; SSE: movntps
-; AVX: vmovntps
+; SSE-LABEL: test_op_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: addps %xmm1, %xmm0
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v4f32:
+; VLX: # BB#0:
+; VLX-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
+; VLX-NEXT: retq
%r = fadd <4 x float> %a, %b
store <4 x float> %r, <4 x float>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_op_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32>* %dst) {
-; CHECK-LABEL: test_op_v4i32:
-; SSE: movntdq
-; AVX: vmovntdq
+; SSE-LABEL: test_op_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v4i32:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
%r = add <4 x i32> %a, %b
store <4 x i32> %r, <4 x i32>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_op_v2f64(<2 x double> %a, <2 x double> %b, <2 x double>* %dst) {
-; CHECK-LABEL: test_op_v2f64:
-; SSE: movntpd
-; AVX: vmovntpd
+; SSE-LABEL: test_op_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: movntpd %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovntpd %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v2f64:
+; VLX: # BB#0:
+; VLX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; VLX-NEXT: vmovntpd %xmm0, (%rdi)
+; VLX-NEXT: retq
%r = fadd <2 x double> %a, %b
store <2 x double> %r, <2 x double>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_op_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64>* %dst) {
-; CHECK-LABEL: test_op_v2i64:
-; SSE: movntdq
-; AVX: vmovntdq
+; SSE-LABEL: test_op_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: paddq %xmm1, %xmm0
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v2i64:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
%r = add <2 x i64> %a, %b
store <2 x i64> %r, <2 x i64>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_op_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16>* %dst) {
-; CHECK-LABEL: test_op_v8i16:
-; SSE: movntdq
-; AVX: vmovntdq
+; SSE-LABEL: test_op_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v8i16:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
%r = add <8 x i16> %a, %b
store <8 x i16> %r, <8 x i16>* %dst, align 16, !nontemporal !1
ret void
}
define void @test_op_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8>* %dst) {
-; CHECK-LABEL: test_op_v16i8:
-; SSE: movntdq
-; AVX: vmovntdq
+; SSE-LABEL: test_op_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddb %xmm1, %xmm0
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v16i8:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: retq
%r = add <16 x i8> %a, %b
store <16 x i8> %r, <16 x i8>* %dst, align 16, !nontemporal !1
ret void
@@ -253,48 +1012,200 @@ define void @test_op_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8>* %dst) {
; And now YMM versions.
define void @test_op_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
-; CHECK-LABEL: test_op_v8f32:
-; AVX: vmovntps %ymm
+; SSE-LABEL: test_op_v8f32:
+; SSE: # BB#0:
+; SSE-NEXT: addps %xmm2, %xmm0
+; SSE-NEXT: addps %xmm3, %xmm1
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vmovntps %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v8f32:
+; VLX: # BB#0:
+; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovntps %ymm0, (%rdi)
+; VLX-NEXT: retq
%r = fadd <8 x float> %a, %b
store <8 x float> %r, <8 x float>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_op_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %dst) {
-; CHECK-LABEL: test_op_v8i32:
-; AVX2: vmovntdq %ymm
+; SSE-LABEL: test_op_v8i32:
+; SSE: # BB#0:
+; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_op_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovntps %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_op_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; VLX-LABEL: test_op_v8i32:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
%r = add <8 x i32> %a, %b
store <8 x i32> %r, <8 x i32>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_op_v4f64(<4 x double> %a, <4 x double> %b, <4 x double>* %dst) {
-; CHECK-LABEL: test_op_v4f64:
-; AVX: vmovntpd %ymm
+; SSE-LABEL: test_op_v4f64:
+; SSE: # BB#0:
+; SSE-NEXT: addpd %xmm2, %xmm0
+; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: movntpd %xmm1, 16(%rdi)
+; SSE-NEXT: movntpd %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_op_v4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vmovntpd %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_op_v4f64:
+; VLX: # BB#0:
+; VLX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovntpd %ymm0, (%rdi)
+; VLX-NEXT: retq
%r = fadd <4 x double> %a, %b
store <4 x double> %r, <4 x double>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_op_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %dst) {
-; CHECK-LABEL: test_op_v4i64:
-; AVX2: vmovntdq %ymm
+; SSE-LABEL: test_op_v4i64:
+; SSE: # BB#0:
+; SSE-NEXT: paddq %xmm2, %xmm0
+; SSE-NEXT: paddq %xmm3, %xmm1
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_op_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovntps %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_op_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; VLX-LABEL: test_op_v4i64:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
%r = add <4 x i64> %a, %b
store <4 x i64> %r, <4 x i64>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_op_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %dst) {
-; CHECK-LABEL: test_op_v16i16:
-; AVX2: vmovntdq %ymm
+; SSE-LABEL: test_op_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: paddw %xmm2, %xmm0
+; SSE-NEXT: paddw %xmm3, %xmm1
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_op_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovntps %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_op_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; VLX-LABEL: test_op_v16i16:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
%r = add <16 x i16> %a, %b
store <16 x i16> %r, <16 x i16>* %dst, align 32, !nontemporal !1
ret void
}
define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
-; CHECK-LABEL: test_op_v32i8:
-; AVX2: vmovntdq %ymm
+; SSE-LABEL: test_op_v32i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddb %xmm2, %xmm0
+; SSE-NEXT: paddb %xmm3, %xmm1
+; SSE-NEXT: movntdq %xmm1, 16(%rdi)
+; SSE-NEXT: movntdq %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_op_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovntps %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_op_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; VLX-LABEL: test_op_v32i8:
+; VLX: # BB#0:
+; VLX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovntdq %ymm0, (%rdi)
+; VLX-NEXT: retq
%r = add <32 x i8> %a, %b
store <32 x i8> %r, <32 x i8>* %dst, align 32, !nontemporal !1
ret void
@@ -305,11 +1216,26 @@ define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
; could even scalarize to movnti when we have 1-alignment: nontemporal is
; probably always worth even some 20 instruction scalarization.
define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
-; CHECK-LABEL: test_unaligned_v8f32:
-; SSE: movntps %xmm
-; SSE: movntps %xmm
-; AVX-NOT: movnt
-; AVX: vmovups %ymm
+; SSE-LABEL: test_unaligned_v8f32:
+; SSE: # BB#0:
+; SSE-NEXT: addps %xmm2, %xmm0
+; SSE-NEXT: addps %xmm3, %xmm1
+; SSE-NEXT: movntps %xmm1, 16(%rdi)
+; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vmovups %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; VLX-LABEL: test_unaligned_v8f32:
+; VLX: # BB#0:
+; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; VLX-NEXT: vmovups %ymm0, (%rdi)
+; VLX-NEXT: retq
%r = fadd <8 x float> %a, %b
store <8 x float> %r, <8 x float>* %dst, align 16, !nontemporal !1
ret void
diff --git a/test/CodeGen/X86/nontemporal-loads.ll b/test/CodeGen/X86/nontemporal-loads.ll
new file mode 100644
index 000000000000..83301e60a1c4
--- /dev/null
+++ b/test/CodeGen/X86/nontemporal-loads.ll
@@ -0,0 +1,1638 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
+
+define <4 x float> @test_v4f32(<4 x float>* %src) {
+; SSE2-LABEL: test_v4f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v4f32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v4f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+ %1 = load <4 x float>, <4 x float>* %src, align 16, !nontemporal !1
+ ret <4 x float> %1
+}
+
+define <4 x i32> @test_v4i32(<4 x i32>* %src) {
+; SSE2-LABEL: test_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_v4i32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_v4i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqa32 (%rdi), %xmm0
+; AVX512VL-NEXT: retq
+ %1 = load <4 x i32>, <4 x i32>* %src, align 16, !nontemporal !1
+ ret <4 x i32> %1
+}
+
+define <2 x double> @test_v2f64(<2 x double>* %src) {
+; SSE2-LABEL: test_v2f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v2f64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v2f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+ %1 = load <2 x double>, <2 x double>* %src, align 16, !nontemporal !1
+ ret <2 x double> %1
+}
+
+define <2 x i64> @test_v2i64(<2 x i64>* %src) {
+; SSE2-LABEL: test_v2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v2i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+ %1 = load <2 x i64>, <2 x i64>* %src, align 16, !nontemporal !1
+ ret <2 x i64> %1
+}
+
+define <8 x i16> @test_v8i16(<8 x i16>* %src) {
+; SSE2-LABEL: test_v8i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v8i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %src, align 16, !nontemporal !1
+ ret <8 x i16> %1
+}
+
+define <16 x i8> @test_v16i8(<16 x i8>* %src) {
+; SSE2-LABEL: test_v16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v16i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
+; AVX512-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8>* %src, align 16, !nontemporal !1
+ ret <16 x i8> %1
+}
+
+; And now YMM versions.
+
+define <8 x float> @test_v8f32(<8 x float>* %src) {
+; SSE2-LABEL: test_v8f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v8f32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v8f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+ %1 = load <8 x float>, <8 x float>* %src, align 32, !nontemporal !1
+ ret <8 x float> %1
+}
+
+define <8 x i32> @test_v8i32(<8 x i32>* %src) {
+; SSE2-LABEL: test_v8i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v8i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_v8i32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_v8i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqa32 (%rdi), %ymm0
+; AVX512VL-NEXT: retq
+ %1 = load <8 x i32>, <8 x i32>* %src, align 32, !nontemporal !1
+ ret <8 x i32> %1
+}
+
+define <4 x double> @test_v4f64(<4 x double>* %src) {
+; SSE2-LABEL: test_v4f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v4f64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v4f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v4f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v4f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+ %1 = load <4 x double>, <4 x double>* %src, align 32, !nontemporal !1
+ ret <4 x double> %1
+}
+
+define <4 x i64> @test_v4i64(<4 x i64>* %src) {
+; SSE2-LABEL: test_v4i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v4i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v4i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+ %1 = load <4 x i64>, <4 x i64>* %src, align 32, !nontemporal !1
+ ret <4 x i64> %1
+}
+
+define <16 x i16> @test_v16i16(<16 x i16>* %src) {
+; SSE2-LABEL: test_v16i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v16i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+ %1 = load <16 x i16>, <16 x i16>* %src, align 32, !nontemporal !1
+ ret <16 x i16> %1
+}
+
+define <32 x i8> @test_v32i8(<32 x i8>* %src) {
+; SSE2-LABEL: test_v32i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v32i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v32i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
+ %1 = load <32 x i8>, <32 x i8>* %src, align 32, !nontemporal !1
+ ret <32 x i8> %1
+}
+
+; And now ZMM versions.
+
+define <16 x float> @test_v16f32(<16 x float>* %src) {
+; SSE2-LABEL: test_v16f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v16f32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v16f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v16f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v16f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
+ %1 = load <16 x float>, <16 x float>* %src, align 64, !nontemporal !1
+ ret <16 x float> %1
+}
+
+define <16 x i32> @test_v16i32(<16 x i32>* %src) {
+; SSE2-LABEL: test_v16i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v16i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v16i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
+ %1 = load <16 x i32>, <16 x i32>* %src, align 64, !nontemporal !1
+ ret <16 x i32> %1
+}
+
+define <8 x double> @test_v8f64(<8 x double>* %src) {
+; SSE2-LABEL: test_v8f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v8f64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v8f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v8f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v8f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
+ %1 = load <8 x double>, <8 x double>* %src, align 64, !nontemporal !1
+ ret <8 x double> %1
+}
+
+define <8 x i64> @test_v8i64(<8 x i64>* %src) {
+; SSE2-LABEL: test_v8i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v8i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v8i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v8i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v8i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
+ %1 = load <8 x i64>, <8 x i64>* %src, align 64, !nontemporal !1
+ ret <8 x i64> %1
+}
+
+define <32 x i16> @test_v32i16(<32 x i16>* %src) {
+; SSE2-LABEL: test_v32i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v32i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v32i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v32i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_v32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_v32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v32i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: retq
+ %1 = load <32 x i16>, <32 x i16>* %src, align 64, !nontemporal !1
+ ret <32 x i16> %1
+}
+
+define <64 x i8> @test_v64i8(<64 x i8>* %src) {
+; SSE2-LABEL: test_v64i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps (%rdi), %xmm0
+; SSE2-NEXT: movaps 16(%rdi), %xmm1
+; SSE2-NEXT: movaps 32(%rdi), %xmm2
+; SSE2-NEXT: movaps 48(%rdi), %xmm3
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_v64i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: movntdqa (%rdi), %xmm0
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_v64i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_v64i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_v64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_v64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_v64i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: retq
+ %1 = load <64 x i8>, <64 x i8>* %src, align 64, !nontemporal !1
+ ret <64 x i8> %1
+}
+
+
+; Check cases where the load would be folded.
+
+define <4 x float> @test_arg_v4f32(<4 x float> %arg, <4 x float>* %src) {
+; SSE-LABEL: test_arg_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: addps (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vaddps (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v4f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vaddps (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = load <4 x float>, <4 x float>* %src, align 16, !nontemporal !1
+ %2 = fadd <4 x float> %arg, %1
+ ret <4 x float> %2
+}
+
+define <4 x i32> @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %src) {
+; SSE-LABEL: test_arg_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: paddd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddd (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = load <4 x i32>, <4 x i32>* %src, align 16, !nontemporal !1
+ %2 = add <4 x i32> %arg, %1
+ ret <4 x i32> %2
+}
+
+define <2 x double> @test_arg_v2f64(<2 x double> %arg, <2 x double>* %src) {
+; SSE-LABEL: test_arg_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: addpd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vaddpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v2f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vaddpd (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = load <2 x double>, <2 x double>* %src, align 16, !nontemporal !1
+ %2 = fadd <2 x double> %arg, %1
+ ret <2 x double> %2
+}
+
+define <2 x i64> @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %src) {
+; SSE-LABEL: test_arg_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: paddq (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddq (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v2i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddq (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = load <2 x i64>, <2 x i64>* %src, align 16, !nontemporal !1
+ %2 = add <2 x i64> %arg, %1
+ ret <2 x i64> %2
+}
+
+define <8 x i16> @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %src) {
+; SSE-LABEL: test_arg_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: paddw (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddw (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddw (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %src, align 16, !nontemporal !1
+ %2 = add <8 x i16> %arg, %1
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %src) {
+; SSE-LABEL: test_arg_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddb (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vpaddb (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddb (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8>* %src, align 16, !nontemporal !1
+ %2 = add <16 x i8> %arg, %1
+ ret <16 x i8> %2
+}
+
+; And now YMM versions.
+
+define <8 x float> @test_arg_v8f32(<8 x float> %arg, <8 x float>* %src) {
+; SSE-LABEL: test_arg_v8f32:
+; SSE: # BB#0:
+; SSE-NEXT: addps (%rdi), %xmm0
+; SSE-NEXT: addps 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vaddps (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v8f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vaddps (%rdi), %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = load <8 x float>, <8 x float>* %src, align 32, !nontemporal !1
+ %2 = fadd <8 x float> %arg, %1
+ ret <8 x float> %2
+}
+
+define <8 x i32> @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %src) {
+; SSE-LABEL: test_arg_v8i32:
+; SSE: # BB#0:
+; SSE-NEXT: paddd (%rdi), %xmm0
+; SSE-NEXT: paddd 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_arg_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_arg_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v8i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddd (%rdi), %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = load <8 x i32>, <8 x i32>* %src, align 32, !nontemporal !1
+ %2 = add <8 x i32> %arg, %1
+ ret <8 x i32> %2
+}
+
+define <4 x double> @test_arg_v4f64(<4 x double> %arg, <4 x double>* %src) {
+; SSE-LABEL: test_arg_v4f64:
+; SSE: # BB#0:
+; SSE-NEXT: addpd (%rdi), %xmm0
+; SSE-NEXT: addpd 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vaddpd (%rdi), %ymm0, %ymm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v4f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vaddpd (%rdi), %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = load <4 x double>, <4 x double>* %src, align 32, !nontemporal !1
+ %2 = fadd <4 x double> %arg, %1
+ ret <4 x double> %2
+}
+
+define <4 x i64> @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %src) {
+; SSE-LABEL: test_arg_v4i64:
+; SSE: # BB#0:
+; SSE-NEXT: paddq (%rdi), %xmm0
+; SSE-NEXT: paddq 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_arg_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_arg_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v4i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddq (%rdi), %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = load <4 x i64>, <4 x i64>* %src, align 32, !nontemporal !1
+ %2 = add <4 x i64> %arg, %1
+ ret <4 x i64> %2
+}
+
+define <16 x i16> @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %src) {
+; SSE-LABEL: test_arg_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: paddw (%rdi), %xmm0
+; SSE-NEXT: paddw 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_arg_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_arg_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddw (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddw (%rdi), %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = load <16 x i16>, <16 x i16>* %src, align 32, !nontemporal !1
+ %2 = add <16 x i16> %arg, %1
+ ret <16 x i16> %2
+}
+
+define <32 x i8> @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %src) {
+; SSE-LABEL: test_arg_v32i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddb (%rdi), %xmm0
+; SSE-NEXT: paddb 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_arg_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_arg_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddb (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v32i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddb (%rdi), %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = load <32 x i8>, <32 x i8>* %src, align 32, !nontemporal !1
+ %2 = add <32 x i8> %arg, %1
+ ret <32 x i8> %2
+}
+
+; And now ZMM versions.
+
+define <16 x float> @test_arg_v16f32(<16 x float> %arg, <16 x float>* %src) {
+; SSE-LABEL: test_arg_v16f32:
+; SSE: # BB#0:
+; SSE-NEXT: addps (%rdi), %xmm0
+; SSE-NEXT: addps 16(%rdi), %xmm1
+; SSE-NEXT: addps 32(%rdi), %xmm2
+; SSE-NEXT: addps 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v16f32:
+; AVX: # BB#0:
+; AVX-NEXT: vaddps (%rdi), %ymm0, %ymm0
+; AVX-NEXT: vaddps 32(%rdi), %ymm1, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v16f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vaddps (%rdi), %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %1 = load <16 x float>, <16 x float>* %src, align 64, !nontemporal !1
+ %2 = fadd <16 x float> %arg, %1
+ ret <16 x float> %2
+}
+
+define <16 x i32> @test_arg_v16i32(<16 x i32> %arg, <16 x i32>* %src) {
+; SSE-LABEL: test_arg_v16i32:
+; SSE: # BB#0:
+; SSE-NEXT: paddd (%rdi), %xmm0
+; SSE-NEXT: paddd 16(%rdi), %xmm1
+; SSE-NEXT: paddd 32(%rdi), %xmm2
+; SSE-NEXT: paddd 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_arg_v16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm2
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_arg_v16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vpaddd 32(%rdi), %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v16i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddd (%rdi), %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %1 = load <16 x i32>, <16 x i32>* %src, align 64, !nontemporal !1
+ %2 = add <16 x i32> %arg, %1
+ ret <16 x i32> %2
+}
+
+define <8 x double> @test_arg_v8f64(<8 x double> %arg, <8 x double>* %src) {
+; SSE-LABEL: test_arg_v8f64:
+; SSE: # BB#0:
+; SSE-NEXT: addpd (%rdi), %xmm0
+; SSE-NEXT: addpd 16(%rdi), %xmm1
+; SSE-NEXT: addpd 32(%rdi), %xmm2
+; SSE-NEXT: addpd 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_arg_v8f64:
+; AVX: # BB#0:
+; AVX-NEXT: vaddpd (%rdi), %ymm0, %ymm0
+; AVX-NEXT: vaddpd 32(%rdi), %ymm1, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v8f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vaddpd (%rdi), %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %1 = load <8 x double>, <8 x double>* %src, align 64, !nontemporal !1
+ %2 = fadd <8 x double> %arg, %1
+ ret <8 x double> %2
+}
+
+define <8 x i64> @test_arg_v8i64(<8 x i64> %arg, <8 x i64>* %src) {
+; SSE-LABEL: test_arg_v8i64:
+; SSE: # BB#0:
+; SSE-NEXT: paddq (%rdi), %xmm0
+; SSE-NEXT: paddq 16(%rdi), %xmm1
+; SSE-NEXT: paddq 32(%rdi), %xmm2
+; SSE-NEXT: paddq 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_arg_v8i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm2
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_arg_v8i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vpaddq 32(%rdi), %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_arg_v8i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddq (%rdi), %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %1 = load <8 x i64>, <8 x i64>* %src, align 64, !nontemporal !1
+ %2 = add <8 x i64> %arg, %1
+ ret <8 x i64> %2
+}
+
+define <32 x i16> @test_arg_v32i16(<32 x i16> %arg, <32 x i16>* %src) {
+; SSE-LABEL: test_arg_v32i16:
+; SSE: # BB#0:
+; SSE-NEXT: paddw (%rdi), %xmm0
+; SSE-NEXT: paddw 16(%rdi), %xmm1
+; SSE-NEXT: paddw 32(%rdi), %xmm2
+; SSE-NEXT: paddw 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_arg_v32i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm2
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpaddw %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpaddw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_arg_v32i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddw (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vpaddw 32(%rdi), %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_arg_v32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpaddw (%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: vpaddw 32(%rdi), %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_arg_v32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpaddw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_arg_v32i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpaddw (%rdi), %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddw 32(%rdi), %ymm1, %ymm1
+; AVX512VL-NEXT: retq
+ %1 = load <32 x i16>, <32 x i16>* %src, align 64, !nontemporal !1
+ %2 = add <32 x i16> %arg, %1
+ ret <32 x i16> %2
+}
+
+define <64 x i8> @test_arg_v64i8(<64 x i8> %arg, <64 x i8>* %src) {
+; SSE-LABEL: test_arg_v64i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddb (%rdi), %xmm0
+; SSE-NEXT: paddb 16(%rdi), %xmm1
+; SSE-NEXT: paddb 32(%rdi), %xmm2
+; SSE-NEXT: paddb 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_arg_v64i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm2
+; AVX1-NEXT: vmovaps 32(%rdi), %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpaddb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_arg_v64i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddb (%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb 32(%rdi), %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_arg_v64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpaddb (%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb 32(%rdi), %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_arg_v64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_arg_v64i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpaddb (%rdi), %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddb 32(%rdi), %ymm1, %ymm1
+; AVX512VL-NEXT: retq
+ %1 = load <64 x i8>, <64 x i8>* %src, align 64, !nontemporal !1
+ %2 = add <64 x i8> %arg, %1
+ ret <64 x i8> %2
+}
+
+
+; Unaligned non-temporal loads (not supported)
+
+define <4 x float> @test_unaligned_v4f32(<4 x float>* %src) {
+; SSE-LABEL: test_unaligned_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_unaligned_v4f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovups (%rdi), %xmm0
+; AVX512-NEXT: retq
+ %1 = load <4 x float>, <4 x float>* %src, align 1, !nontemporal !1
+ ret <4 x float> %1
+}
+
+define <4 x i32> @test_unaligned_v4i32(<4 x i32>* %src) {
+; SSE-LABEL: test_unaligned_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v4i32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v4i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v4i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu32 (%rdi), %xmm0
+; AVX512VL-NEXT: retq
+ %1 = load <4 x i32>, <4 x i32>* %src, align 1, !nontemporal !1
+ ret <4 x i32> %1
+}
+
+define <2 x double> @test_unaligned_v2f64(<2 x double>* %src) {
+; SSE-LABEL: test_unaligned_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v2f64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v2f64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v2f64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovupd (%rdi), %xmm0
+; AVX512VL-NEXT: retq
+ %1 = load <2 x double>, <2 x double>* %src, align 1, !nontemporal !1
+ ret <2 x double> %1
+}
+
+define <2 x i64> @test_unaligned_v2i64(<2 x i64>* %src) {
+; SSE-LABEL: test_unaligned_v2i64:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v2i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v2i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %xmm0
+; AVX512VL-NEXT: retq
+ %1 = load <2 x i64>, <2 x i64>* %src, align 1, !nontemporal !1
+ ret <2 x i64> %1
+}
+
+define <8 x i16> @test_unaligned_v8i16(<8 x i16>* %src) {
+; SSE-LABEL: test_unaligned_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v8i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v8i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v8i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %xmm0
+; AVX512VL-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %src, align 1, !nontemporal !1
+ ret <8 x i16> %1
+}
+
+define <16 x i8> @test_unaligned_v16i8(<16 x i8>* %src) {
+; SSE-LABEL: test_unaligned_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v16i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %xmm0
+; AVX512VL-NEXT: retq
+ %1 = load <16 x i8>, <16 x i8>* %src, align 1, !nontemporal !1
+ ret <16 x i8> %1
+}
+
+; And now YMM versions.
+
+define <8 x float> @test_unaligned_v8f32(<8 x float>* %src) {
+; SSE-LABEL: test_unaligned_v8f32:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_unaligned_v8f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovups (%rdi), %ymm0
+; AVX512-NEXT: retq
+ %1 = load <8 x float>, <8 x float>* %src, align 1, !nontemporal !1
+ ret <8 x float> %1
+}
+
+define <8 x i32> @test_unaligned_v8i32(<8 x i32>* %src) {
+; SSE-LABEL: test_unaligned_v8i32:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v8i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v8i32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v8i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu32 (%rdi), %ymm0
+; AVX512VL-NEXT: retq
+ %1 = load <8 x i32>, <8 x i32>* %src, align 1, !nontemporal !1
+ ret <8 x i32> %1
+}
+
+define <4 x double> @test_unaligned_v4f64(<4 x double>* %src) {
+; SSE-LABEL: test_unaligned_v4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v4f64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v4f64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v4f64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovupd (%rdi), %ymm0
+; AVX512VL-NEXT: retq
+ %1 = load <4 x double>, <4 x double>* %src, align 1, !nontemporal !1
+ ret <4 x double> %1
+}
+
+define <4 x i64> @test_unaligned_v4i64(<4 x i64>* %src) {
+; SSE-LABEL: test_unaligned_v4i64:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v4i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v4i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v4i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %ymm0
+; AVX512VL-NEXT: retq
+ %1 = load <4 x i64>, <4 x i64>* %src, align 1, !nontemporal !1
+ ret <4 x i64> %1
+}
+
+define <16 x i16> @test_unaligned_v16i16(<16 x i16>* %src) {
+; SSE-LABEL: test_unaligned_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v16i16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v16i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v16i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v16i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %ymm0
+; AVX512VL-NEXT: retq
+ %1 = load <16 x i16>, <16 x i16>* %src, align 1, !nontemporal !1
+ ret <16 x i16> %1
+}
+
+define <32 x i8> @test_unaligned_v32i8(<32 x i8>* %src) {
+; SSE-LABEL: test_unaligned_v32i8:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v32i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v32i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v32i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovups (%rdi), %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v32i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %ymm0
+; AVX512VL-NEXT: retq
+ %1 = load <32 x i8>, <32 x i8>* %src, align 1, !nontemporal !1
+ ret <32 x i8> %1
+}
+
+; And now ZMM versions.
+
+define <16 x float> @test_unaligned_v16f32(<16 x float>* %src) {
+; SSE-LABEL: test_unaligned_v16f32:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: movups 32(%rdi), %xmm2
+; SSE-NEXT: movups 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v16f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: vmovups 32(%rdi), %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_unaligned_v16f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovups (%rdi), %zmm0
+; AVX512-NEXT: retq
+ %1 = load <16 x float>, <16 x float>* %src, align 1, !nontemporal !1
+ ret <16 x float> %1
+}
+
+define <16 x i32> @test_unaligned_v16i32(<16 x i32>* %src) {
+; SSE-LABEL: test_unaligned_v16i32:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: movups 32(%rdi), %xmm2
+; SSE-NEXT: movups 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v16i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: vmovups 32(%rdi), %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_unaligned_v16i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0
+; AVX512-NEXT: retq
+ %1 = load <16 x i32>, <16 x i32>* %src, align 1, !nontemporal !1
+ ret <16 x i32> %1
+}
+
+define <8 x double> @test_unaligned_v8f64(<8 x double>* %src) {
+; SSE-LABEL: test_unaligned_v8f64:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: movups 32(%rdi), %xmm2
+; SSE-NEXT: movups 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v8f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: vmovups 32(%rdi), %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_unaligned_v8f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovupd (%rdi), %zmm0
+; AVX512-NEXT: retq
+ %1 = load <8 x double>, <8 x double>* %src, align 1, !nontemporal !1
+ ret <8 x double> %1
+}
+
+define <8 x i64> @test_unaligned_v8i64(<8 x i64>* %src) {
+; SSE-LABEL: test_unaligned_v8i64:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: movups 32(%rdi), %xmm2
+; SSE-NEXT: movups 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v8i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: vmovups 32(%rdi), %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_unaligned_v8i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512-NEXT: retq
+ %1 = load <8 x i64>, <8 x i64>* %src, align 1, !nontemporal !1
+ ret <8 x i64> %1
+}
+
+define <32 x i16> @test_unaligned_v32i16(<32 x i16>* %src) {
+; SSE-LABEL: test_unaligned_v32i16:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: movups 32(%rdi), %xmm2
+; SSE-NEXT: movups 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v32i16:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: vmovups 32(%rdi), %ymm1
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: vmovups 32(%rdi), %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v32i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqu64 32(%rdi), %ymm1
+; AVX512VL-NEXT: retq
+ %1 = load <32 x i16>, <32 x i16>* %src, align 1, !nontemporal !1
+ ret <32 x i16> %1
+}
+
+define <64 x i8> @test_unaligned_v64i8(<64 x i8>* %src) {
+; SSE-LABEL: test_unaligned_v64i8:
+; SSE: # BB#0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: movups 16(%rdi), %xmm1
+; SSE-NEXT: movups 32(%rdi), %xmm2
+; SSE-NEXT: movups 48(%rdi), %xmm3
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_unaligned_v64i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX-NEXT: vmovups 32(%rdi), %ymm1
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: test_unaligned_v64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: vmovups 32(%rdi), %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_unaligned_v64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VL-LABEL: test_unaligned_v64i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqu64 32(%rdi), %ymm1
+; AVX512VL-NEXT: retq
+ %1 = load <64 x i8>, <64 x i8>* %src, align 1, !nontemporal !1
+ ret <64 x i8> %1
+}
+
+!1 = !{i32 1}
diff --git a/test/CodeGen/X86/nontemporal.ll b/test/CodeGen/X86/nontemporal.ll
index 9a2f23596f79..33d5caba597c 100644
--- a/test/CodeGen/X86/nontemporal.ll
+++ b/test/CodeGen/X86/nontemporal.ll
@@ -1,24 +1,135 @@
-; RUN: llc < %s -mtriple x86_64-unknown-unknown | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32-SSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X32-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64-AVX
-define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, i64 %F) {
-; CHECK: movntps
+define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4 x i32> %F, <8 x i16> %G, <16 x i8> %H, i64 %I) nounwind {
+; X32-SSE-LABEL: f:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pushl %ebp
+; X32-SSE-NEXT: movl %esp, %ebp
+; X32-SSE-NEXT: pushl %esi
+; X32-SSE-NEXT: andl $-16, %esp
+; X32-SSE-NEXT: subl $16, %esp
+; X32-SSE-NEXT: movl 72(%ebp), %eax
+; X32-SSE-NEXT: movl 76(%ebp), %ecx
+; X32-SSE-NEXT: movl 12(%ebp), %edx
+; X32-SSE-NEXT: movdqa 56(%ebp), %xmm3
+; X32-SSE-NEXT: movdqa 40(%ebp), %xmm4
+; X32-SSE-NEXT: movdqa 24(%ebp), %xmm5
+; X32-SSE-NEXT: movl 8(%ebp), %esi
+; X32-SSE-NEXT: addps .LCPI0_0, %xmm0
+; X32-SSE-NEXT: movntps %xmm0, (%esi)
+; X32-SSE-NEXT: paddq .LCPI0_1, %xmm2
+; X32-SSE-NEXT: movntdq %xmm2, (%esi)
+; X32-SSE-NEXT: addpd .LCPI0_2, %xmm1
+; X32-SSE-NEXT: movntpd %xmm1, (%esi)
+; X32-SSE-NEXT: paddd .LCPI0_3, %xmm5
+; X32-SSE-NEXT: movntdq %xmm5, (%esi)
+; X32-SSE-NEXT: paddw .LCPI0_4, %xmm4
+; X32-SSE-NEXT: movntdq %xmm4, (%esi)
+; X32-SSE-NEXT: paddb .LCPI0_5, %xmm3
+; X32-SSE-NEXT: movntdq %xmm3, (%esi)
+; X32-SSE-NEXT: movntil %edx, (%esi)
+; X32-SSE-NEXT: movntil %ecx, 4(%esi)
+; X32-SSE-NEXT: movntil %eax, (%esi)
+; X32-SSE-NEXT: leal -4(%ebp), %esp
+; X32-SSE-NEXT: popl %esi
+; X32-SSE-NEXT: popl %ebp
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: f:
+; X32-AVX: # BB#0:
+; X32-AVX-NEXT: pushl %ebp
+; X32-AVX-NEXT: movl %esp, %ebp
+; X32-AVX-NEXT: pushl %esi
+; X32-AVX-NEXT: andl $-16, %esp
+; X32-AVX-NEXT: subl $16, %esp
+; X32-AVX-NEXT: movl 72(%ebp), %eax
+; X32-AVX-NEXT: movl 76(%ebp), %ecx
+; X32-AVX-NEXT: movl 12(%ebp), %edx
+; X32-AVX-NEXT: vmovdqa 56(%ebp), %xmm3
+; X32-AVX-NEXT: vmovdqa 40(%ebp), %xmm4
+; X32-AVX-NEXT: vmovdqa 24(%ebp), %xmm5
+; X32-AVX-NEXT: movl 8(%ebp), %esi
+; X32-AVX-NEXT: vaddps .LCPI0_0, %xmm0, %xmm0
+; X32-AVX-NEXT: vmovntps %xmm0, (%esi)
+; X32-AVX-NEXT: vpaddq .LCPI0_1, %xmm2, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
+; X32-AVX-NEXT: vaddpd .LCPI0_2, %xmm1, %xmm0
+; X32-AVX-NEXT: vmovntpd %xmm0, (%esi)
+; X32-AVX-NEXT: vpaddd .LCPI0_3, %xmm5, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
+; X32-AVX-NEXT: vpaddw .LCPI0_4, %xmm4, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
+; X32-AVX-NEXT: vpaddb .LCPI0_5, %xmm3, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
+; X32-AVX-NEXT: movntil %edx, (%esi)
+; X32-AVX-NEXT: movntil %ecx, 4(%esi)
+; X32-AVX-NEXT: movntil %eax, (%esi)
+; X32-AVX-NEXT: leal -4(%ebp), %esp
+; X32-AVX-NEXT: popl %esi
+; X32-AVX-NEXT: popl %ebp
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: f:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: addps {{.*}}(%rip), %xmm0
+; X64-SSE-NEXT: movntps %xmm0, (%rdi)
+; X64-SSE-NEXT: paddq {{.*}}(%rip), %xmm2
+; X64-SSE-NEXT: movntdq %xmm2, (%rdi)
+; X64-SSE-NEXT: addpd {{.*}}(%rip), %xmm1
+; X64-SSE-NEXT: movntpd %xmm1, (%rdi)
+; X64-SSE-NEXT: paddd {{.*}}(%rip), %xmm3
+; X64-SSE-NEXT: movntdq %xmm3, (%rdi)
+; X64-SSE-NEXT: paddw {{.*}}(%rip), %xmm4
+; X64-SSE-NEXT: movntdq %xmm4, (%rdi)
+; X64-SSE-NEXT: paddb {{.*}}(%rip), %xmm5
+; X64-SSE-NEXT: movntdq %xmm5, (%rdi)
+; X64-SSE-NEXT: movntil %esi, (%rdi)
+; X64-SSE-NEXT: movntiq %rdx, (%rdi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: f:
+; X64-AVX: # BB#0:
+; X64-AVX-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovntps %xmm0, (%rdi)
+; X64-AVX-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm0
+; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; X64-AVX-NEXT: vaddpd {{.*}}(%rip), %xmm1, %xmm0
+; X64-AVX-NEXT: vmovntpd %xmm0, (%rdi)
+; X64-AVX-NEXT: vpaddd {{.*}}(%rip), %xmm3, %xmm0
+; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; X64-AVX-NEXT: vpaddw {{.*}}(%rip), %xmm4, %xmm0
+; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; X64-AVX-NEXT: vpaddb {{.*}}(%rip), %xmm5, %xmm0
+; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; X64-AVX-NEXT: movntil %esi, (%rdi)
+; X64-AVX-NEXT: movntiq %rdx, (%rdi)
+; X64-AVX-NEXT: retq
%cast = bitcast i8* %B to <4 x float>*
- %A2 = fadd <4 x float> %A, <float 0x0, float 0x0, float 0x0, float 0x4200000000000000>
+ %A2 = fadd <4 x float> %A, <float 1.0, float 2.0, float 3.0, float 4.0>
store <4 x float> %A2, <4 x float>* %cast, align 16, !nontemporal !0
-; CHECK: movntdq
%cast1 = bitcast i8* %B to <2 x i64>*
%E2 = add <2 x i64> %E, <i64 1, i64 2>
store <2 x i64> %E2, <2 x i64>* %cast1, align 16, !nontemporal !0
-; CHECK: movntpd
%cast2 = bitcast i8* %B to <2 x double>*
- %C2 = fadd <2 x double> %C, <double 0x0, double 0x4200000000000000>
+ %C2 = fadd <2 x double> %C, <double 1.0, double 2.0>
store <2 x double> %C2, <2 x double>* %cast2, align 16, !nontemporal !0
-; CHECK: movntil
- %cast3 = bitcast i8* %B to i32*
- store i32 %D, i32* %cast3, align 1, !nontemporal !0
-; CHECK: movntiq
- %cast4 = bitcast i8* %B to i64*
- store i64 %F, i64* %cast4, align 1, !nontemporal !0
+ %cast3 = bitcast i8* %B to <4 x i32>*
+ %F2 = add <4 x i32> %F, <i32 1, i32 2, i32 3, i32 4>
+ store <4 x i32> %F2, <4 x i32>* %cast3, align 16, !nontemporal !0
+ %cast4 = bitcast i8* %B to <8 x i16>*
+ %G2 = add <8 x i16> %G, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
+ store <8 x i16> %G2, <8 x i16>* %cast4, align 16, !nontemporal !0
+ %cast5 = bitcast i8* %B to <16 x i8>*
+ %H2 = add <16 x i8> %H, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
+ store <16 x i8> %H2, <16 x i8>* %cast5, align 16, !nontemporal !0
+ %cast6 = bitcast i8* %B to i32*
+ store i32 %D, i32* %cast6, align 1, !nontemporal !0
+ %cast7 = bitcast i8* %B to i64*
+ store i64 %I, i64* %cast7, align 1, !nontemporal !0
ret void
}
diff --git a/test/CodeGen/X86/noreturn-call.ll b/test/CodeGen/X86/noreturn-call.ll
new file mode 100644
index 000000000000..89781816de82
--- /dev/null
+++ b/test/CodeGen/X86/noreturn-call.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
+
+define void @test1(i32 %c) {
+; CHECK-LABEL: test1:
+entry:
+ %0 = alloca i8, i32 %c
+ %tobool = icmp eq i32 %c, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.end:
+ call void @g(i8* %0)
+ ret void
+
+if.then:
+ call void @crash(i8* %0)
+ unreachable
+; CHECK: calll _crash
+; There is no need to adjust the stack after the call, since
+; the function is noreturn and that code will therefore never run.
+; CHECK-NOT: add
+; CHECK-NOT: pop
+}
+
+define void @test2(i32 %c) {
+; CHECK-LABEL: test2:
+entry:
+ %0 = alloca i8, i32 %c
+ %tobool = icmp eq i32 %c, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.end:
+ call void @g(i8* %0)
+ ret void
+
+if.then:
+ call void @crash2(i8* %0)
+ unreachable
+; CHECK: calll _crash2
+; Even though _crash2 is not marked noreturn, it is noreturn in practice
+; because of the "unreachable" right after it. This happens e.g. when
+; control falls off the end of a non-void function after a call.
+; CHECK-NOT: add
+; CHECK-NOT: pop
+}
+
+declare void @crash(i8*) noreturn
+declare void @crash2(i8*)
+declare void @g(i8*)
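
For illustration only (not part of the test above): the call-followed-by-`unreachable` pattern that test2 checks is one a C frontend commonly emits when the source states that control cannot continue past a call. A minimal, hypothetical C sketch using __builtin_unreachable (one of several ways the pattern can arise; another is falling off the end of a non-void function, as the comment above notes):

  /* Hypothetical C source, assumed for illustration. crash2 is not declared
     noreturn, but the explicit __builtin_unreachable() after the call makes
     clang emit the call followed by the IR instruction `unreachable`, which
     is the shape checked in test2 above. */
  void crash2(char *);

  void test2_like(char *p) {
    crash2(p);                 /* never returns in practice */
    __builtin_unreachable();   /* lowered to `unreachable` in the IR */
  }
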
diff --git a/test/CodeGen/X86/null-streamer.ll b/test/CodeGen/X86/null-streamer.ll
index e80f3fcbe58d..b331b92868f1 100644
--- a/test/CodeGen/X86/null-streamer.ll
+++ b/test/CodeGen/X86/null-streamer.ll
@@ -14,11 +14,10 @@ define void @f1() {
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!11, !13}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: " ", isOptimized: true, emissionKind: 0, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !9, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: " ", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !9, imports: !2)
!1 = !DIFile(filename: "file.c", directory: "")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2, file: !1, scope: !1, type: !6, variables: !2)
+!4 = distinct !DISubprogram(name: "", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 2, file: !1, scope: !1, type: !6, variables: !2)
!6 = !DISubroutineType(types: !7)
!7 = !{!8}
!8 = !DIBasicType(tag: DW_TAG_base_type, size: 32, align: 32, encoding: DW_ATE_signed)
diff --git a/test/CodeGen/X86/opt-ext-uses.ll b/test/CodeGen/X86/opt-ext-uses.ll
index 39e6fd0e6a59..b654a81c11cd 100644
--- a/test/CodeGen/X86/opt-ext-uses.ll
+++ b/test/CodeGen/X86/opt-ext-uses.ll
@@ -2,8 +2,8 @@
; This test should get one and only one register to register mov.
; CHECK-LABEL: t:
-; CHECK: movw
-; CHECK-NOT: movw
+; CHECK: movl
+; CHECK-NOT: mov
; CHECK: ret
define signext i16 @t() {
diff --git a/test/CodeGen/X86/or-lea.ll b/test/CodeGen/X86/or-lea.ll
index f45a639ffa2c..e65056a91c43 100644
--- a/test/CodeGen/X86/or-lea.ll
+++ b/test/CodeGen/X86/or-lea.ll
@@ -9,6 +9,8 @@
define i32 @or_shift1_and1(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift1_and1:
; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: leal (%rsi,%rdi,2), %eax
; CHECK-NEXT: retq
@@ -22,6 +24,8 @@ define i32 @or_shift1_and1(i32 %x, i32 %y) {
define i32 @or_shift1_and1_swapped(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift1_and1_swapped:
; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: leal (%rsi,%rdi,2), %eax
; CHECK-NEXT: retq
@@ -35,6 +39,8 @@ define i32 @or_shift1_and1_swapped(i32 %x, i32 %y) {
define i32 @or_shift2_and1(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift2_and1:
; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: leal (%rsi,%rdi,4), %eax
; CHECK-NEXT: retq
@@ -48,6 +54,8 @@ define i32 @or_shift2_and1(i32 %x, i32 %y) {
define i32 @or_shift3_and1(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift3_and1:
; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: leal (%rsi,%rdi,8), %eax
; CHECK-NEXT: retq
@@ -61,6 +69,8 @@ define i32 @or_shift3_and1(i32 %x, i32 %y) {
define i32 @or_shift3_and7(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift3_and7:
; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; CHECK-NEXT: andl $7, %esi
; CHECK-NEXT: leal (%rsi,%rdi,8), %eax
; CHECK-NEXT: retq
@@ -76,6 +86,8 @@ define i32 @or_shift3_and7(i32 %x, i32 %y) {
define i32 @or_shift4_and1(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift4_and1:
; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; CHECK-NEXT: shll $4, %edi
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: leal (%rsi,%rdi), %eax
@@ -92,6 +104,7 @@ define i32 @or_shift4_and1(i32 %x, i32 %y) {
define i32 @or_shift3_and8(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift3_and8:
; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; CHECK-NEXT: leal (,%rdi,8), %eax
; CHECK-NEXT: andl $8, %esi
; CHECK-NEXT: orl %esi, %eax
diff --git a/test/CodeGen/X86/osx-private-labels.ll b/test/CodeGen/X86/osx-private-labels.ll
index e30cb4824aa7..d7f0251c1387 100644
--- a/test/CodeGen/X86/osx-private-labels.ll
+++ b/test/CodeGen/X86/osx-private-labels.ll
@@ -11,7 +11,7 @@
@private2 = private unnamed_addr constant [5 x i16] [i16 116, i16 101,
i16 115, i16 116, i16 0]
; CHECK: .section __TEXT,__ustring
-; CHECK-NEXT: .align 1
+; CHECK-NEXT: .p2align 1
; CHECK-NEXT: l_private2:
 ; There is no dedicated 4 byte string section on MachO.
@@ -19,60 +19,60 @@
%struct.NSConstantString = type { i32*, i32, i8*, i32 }
@private3 = private constant %struct.NSConstantString { i32* null, i32 1992, i8* null, i32 0 }, section "__DATA,__cfstring"
; CHECK: .section __DATA,__cfstring
-; CHECK-NEXT: .align 4
+; CHECK-NEXT: .p2align 4
; CHECK-NEXT: L_private3:
; There is no dedicated 1 or 2 byte constant section on MachO.
@private4 = private unnamed_addr constant i32 42
; CHECK: .section __TEXT,__literal4,4byte_literals
-; CHECK-NEXT: .align 2
+; CHECK-NEXT: .p2align 2
; CHECK-NEXT: L_private4:
@private5 = private unnamed_addr constant i64 42
; CHECK: .section __TEXT,__literal8,8byte_literals
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: L_private5:
@private6 = private unnamed_addr constant i128 42
; CHECK: .section __TEXT,__literal16,16byte_literals
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: L_private6:
%struct._objc_class = type { i8* }
@private7 = private global %struct._objc_class* null, section "__OBJC,__cls_refs,literal_pointers,no_dead_strip"
; CHECK: .section __OBJC,__cls_refs,literal_pointers,no_dead_strip
-; CHECK: .align 3
+; CHECK: .p2align 3
; CHECK: L_private7:
@private8 = private global i32* null, section "__DATA,__nl_symbol_ptr,non_lazy_symbol_pointers"
; CHECK: .section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: L_private8:
@private9 = private global i32* null, section "__DATA,__la_symbol_ptr,lazy_symbol_pointers"
; CHECK: .section __DATA,__la_symbol_ptr,lazy_symbol_pointers
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: L_private9:
@private10 = private global i32* null, section "__DATA,__mod_init_func,mod_init_funcs"
; CHECK: .section __DATA,__mod_init_func,mod_init_funcs
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: L_private10:
@private11 = private global i32* null, section "__DATA,__mod_term_func,mod_term_funcs"
; CHECK: .section __DATA,__mod_term_func,mod_term_funcs
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: L_private11:
@private12 = private global i32* null, section "__DATA,__foobar,interposing"
; CHECK: .section __DATA,__foobar,interposing
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: L_private12:
@private13 = private global i32 42, section "__DATA, __objc_classlist, regular, no_dead_strip"
; CHECK: .section __DATA,__objc_classlist,regular,no_dead_strip
-; CHECK-NEXT: .align 2
+; CHECK-NEXT: .p2align 2
; CHECK-NEXT: L_private13:
@private14 = private global [4 x i8] c"zed\00", section "__TEXT,__objc_classname,cstring_literals"
diff --git a/test/CodeGen/X86/patchable-prologue.ll b/test/CodeGen/X86/patchable-prologue.ll
new file mode 100644
index 000000000000..c8daff33181c
--- /dev/null
+++ b/test/CodeGen/X86/patchable-prologue.ll
@@ -0,0 +1,67 @@
+; RUN: llc -filetype=obj -o - -mtriple=x86_64-apple-macosx < %s | llvm-objdump -triple x86_64-apple-macosx -disassemble - | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-macosx < %s | FileCheck %s --check-prefix=CHECK-ALIGN
+
+declare void @callee(i64*)
+
+define void @f0() "patchable-function"="prologue-short-redirect" {
+; CHECK-LABEL: _f0:
+; CHECK-NEXT: 66 90 nop
+
+; CHECK-ALIGN: .p2align 4, 0x90
+; CHECK-ALIGN: _f0:
+
+ ret void
+}
+
+define void @f1() "patchable-function"="prologue-short-redirect" "no-frame-pointer-elim"="true" {
+; CHECK-LABEL: _f1
+; CHECK-NEXT: ff f5 pushq %rbp
+
+; CHECK-ALIGN: .p2align 4, 0x90
+; CHECK-ALIGN: _f1:
+ ret void
+}
+
+define void @f2() "patchable-function"="prologue-short-redirect" {
+; CHECK-LABEL: _f2
+; CHECK-NEXT: 48 81 ec a8 00 00 00 subq $168, %rsp
+
+; CHECK-ALIGN: .p2align 4, 0x90
+; CHECK-ALIGN: _f2:
+ %ptr = alloca i64, i32 20
+ call void @callee(i64* %ptr)
+ ret void
+}
+
+define void @f3() "patchable-function"="prologue-short-redirect" optsize {
+; CHECK-LABEL: _f3
+; CHECK-NEXT: 66 90 nop
+
+; CHECK-ALIGN: .p2align 4, 0x90
+; CHECK-ALIGN: _f3:
+ ret void
+}
+
+; This testcase happens to produce a KILL instruction at the beginning of the
+; first basic block. In this case the 2nd instruction should be turned into a
+; patchable one.
+; CHECK-LABEL: f4:
+; CHECK-NEXT: 8b 0c 37 movl (%rdi,%rsi), %ecx
+define i32 @f4(i8* %arg1, i64 %arg2, i32 %arg3) "patchable-function"="prologue-short-redirect" {
+bb:
+ %tmp10 = getelementptr i8, i8* %arg1, i64 %arg2
+ %tmp11 = bitcast i8* %tmp10 to i32*
+ %tmp12 = load i32, i32* %tmp11, align 4
+ fence acquire
+ %tmp13 = add i32 %tmp12, %arg3
+ %tmp14 = cmpxchg i32* %tmp11, i32 %tmp12, i32 %tmp13 seq_cst monotonic
+ %tmp15 = extractvalue { i32, i1 } %tmp14, 1
+ br i1 %tmp15, label %bb21, label %bb16
+
+bb16:
+ br label %bb21
+
+bb21:
+ %tmp22 = phi i32 [ %tmp12, %bb ], [ %arg3, %bb16 ]
+ ret i32 %tmp22
+}
diff --git a/test/CodeGen/X86/patchpoint-verifiable.mir b/test/CodeGen/X86/patchpoint-verifiable.mir
index 300ecaf002f2..c108473a1824 100644
--- a/test/CodeGen/X86/patchpoint-verifiable.mir
+++ b/test/CodeGen/X86/patchpoint-verifiable.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=x86_64-apple-darwin -stop-after branch-folder -start-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: llc -mtriple=x86_64-apple-darwin -stop-after branch-folder -start-after branch-folder -o - %s | FileCheck %s
# This test verifies that the machine verifier won't report an error when
# verifying the PATCHPOINT instruction.
diff --git a/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
index 441fb02a89e6..4bdfee6f81eb 100644
--- a/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
+++ b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s
-; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s
+; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK32
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK64
; TODO: Reenable verify-machineinstrs once the if (!AXDead) // FIXME in
; X86InstrInfo::copyPhysReg() is resolved.
@@ -142,14 +142,21 @@ f:
; CHECK: cmpxchg
; CHECK: seto %al
; CHECK-NEXT: lahf
-; Save result of the first cmpxchg into D.
-; CHECK-NEXT: mov{{[lq]}} %[[AX:[er]ax]], %[[D:[re]d[xi]]]
+; Save result of the first cmpxchg into a temporary.
+; For the 32-bit ISA, EDX and EAX are used by the results.
+; EAX, EBX, ECX, and EDX are used to set the arguments.
+; That leaves us EDI and ESI.
+; CHECK32-NEXT: movl %[[AX:eax]], %[[TMP:e[ds]i]]
+; For 64-bit ISA, RAX is used for both the result and argument.
+; This leaves us plenty of choices for the temporary. For now,
+; this is rdx, but any register could do.
+; CHECK64-NEXT: mov{{[lq]}} %[[AX:[er]ax]], %[[TMP:rdx]]
; CHECK: cmpxchg
; CHECK-NEXT: sete %al
; Save result of the second cmpxchg onto the stack.
; CHECK-NEXT: push{{[lq]}} %[[AX]]
; Restore result of the first cmpxchg from D, put it back in EFLAGS.
-; CHECK-NEXT: mov{{[lq]}} %[[D]], %[[AX]]
+; CHECK-NEXT: mov{{[lq]}} %[[TMP]], %[[AX]]
; CHECK-NEXT: addb $127, %al
; CHECK-NEXT: sahf
; Restore result of the second cmpxchg from the stack.
diff --git a/test/CodeGen/X86/phaddsub.ll b/test/CodeGen/X86/phaddsub.ll
index 17e7e1dfdcf7..44ad05ec6ed7 100644
--- a/test/CodeGen/X86/phaddsub.ll
+++ b/test/CodeGen/X86/phaddsub.ll
@@ -1,168 +1,225 @@
-; RUN: llc < %s -march=x86-64 -mattr=+ssse3,-avx | FileCheck %s -check-prefix=SSSE3
-; RUN: llc < %s -march=x86-64 -mattr=-ssse3,+avx | FileCheck %s -check-prefix=AVX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
+define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw1:
-; SSSE3-NOT: vphaddw
-; SSSE3: phaddw
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddw %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phaddw1:
-; AVX: vphaddw
-define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%r = add <8 x i16> %a, %b
ret <8 x i16> %r
}
+define <8 x i16> @phaddw2(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw2:
-; SSSE3-NOT: vphaddw
-; SSSE3: phaddw
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddw %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phaddw2:
-; AVX: vphaddw
-define <8 x i16> @phaddw2(<8 x i16> %x, <8 x i16> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14>
%b = shufflevector <8 x i16> %y, <8 x i16> %x, <8 x i32> <i32 8, i32 11, i32 12, i32 15, i32 0, i32 3, i32 4, i32 7>
%r = add <8 x i16> %a, %b
ret <8 x i16> %r
}
+define <4 x i32> @phaddd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd1:
-; SSSE3-NOT: vphaddd
-; SSSE3: phaddd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phaddd1:
-; AVX: vphaddd
-define <4 x i32> @phaddd1(<4 x i32> %x, <4 x i32> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%r = add <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <4 x i32> @phaddd2(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd2:
-; SSSE3-NOT: vphaddd
-; SSSE3: phaddd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phaddd2:
-; AVX: vphaddd
-define <4 x i32> @phaddd2(<4 x i32> %x, <4 x i32> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
%b = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
%r = add <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <4 x i32> @phaddd3(<4 x i32> %x) {
; SSSE3-LABEL: phaddd3:
-; SSSE3-NOT: vphaddd
-; SSSE3: phaddd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phaddd3:
-; AVX: vphaddd
-define <4 x i32> @phaddd3(<4 x i32> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
%r = add <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <4 x i32> @phaddd4(<4 x i32> %x) {
; SSSE3-LABEL: phaddd4:
-; SSSE3-NOT: vphaddd
-; SSSE3: phaddd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phaddd4:
-; AVX: vphaddd
-define <4 x i32> @phaddd4(<4 x i32> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%r = add <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <4 x i32> @phaddd5(<4 x i32> %x) {
; SSSE3-LABEL: phaddd5:
-; SSSE3-NOT: vphaddd
-; SSSE3: phaddd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phaddd5:
-; AVX: vphaddd
-define <4 x i32> @phaddd5(<4 x i32> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
%r = add <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <4 x i32> @phaddd6(<4 x i32> %x) {
; SSSE3-LABEL: phaddd6:
-; SSSE3-NOT: vphaddd
-; SSSE3: phaddd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phaddd6:
-; AVX: vphaddd
-define <4 x i32> @phaddd6(<4 x i32> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%r = add <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <4 x i32> @phaddd7(<4 x i32> %x) {
; SSSE3-LABEL: phaddd7:
-; SSSE3-NOT: vphaddd
-; SSSE3: phaddd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phaddd7:
-; AVX: vphaddd
-define <4 x i32> @phaddd7(<4 x i32> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
%r = add <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <8 x i16> @phsubw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1:
-; SSSE3-NOT: vphsubw
-; SSSE3: phsubw
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phsubw %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phsubw1:
-; AVX: vphsubw
-define <8 x i16> @phsubw1(<8 x i16> %x, <8 x i16> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vphsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%r = sub <8 x i16> %a, %b
ret <8 x i16> %r
}
+define <4 x i32> @phsubd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1:
-; SSSE3-NOT: vphsubd
-; SSSE3: phsubd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phsubd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phsubd1:
-; AVX: vphsubd
-define <4 x i32> @phsubd1(<4 x i32> %x, <4 x i32> %y) {
+; AVX: # BB#0:
+; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
%r = sub <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <4 x i32> @phsubd2(<4 x i32> %x) {
; SSSE3-LABEL: phsubd2:
-; SSSE3-NOT: vphsubd
-; SSSE3: phsubd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phsubd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phsubd2:
-; AVX: vphsubd
-define <4 x i32> @phsubd2(<4 x i32> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
%r = sub <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <4 x i32> @phsubd3(<4 x i32> %x) {
; SSSE3-LABEL: phsubd3:
-; SSSE3-NOT: vphsubd
-; SSSE3: phsubd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phsubd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phsubd3:
-; AVX: vphsubd
-define <4 x i32> @phsubd3(<4 x i32> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%r = sub <4 x i32> %a, %b
ret <4 x i32> %r
}
+define <4 x i32> @phsubd4(<4 x i32> %x) {
; SSSE3-LABEL: phsubd4:
-; SSSE3-NOT: vphsubd
-; SSSE3: phsubd
+; SSSE3: # BB#0:
+; SSSE3-NEXT: phsubd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
; AVX-LABEL: phsubd4:
-; AVX: vphsubd
-define <4 x i32> @phsubd4(<4 x i32> %x) {
+; AVX: # BB#0:
+; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%r = sub <4 x i32> %a, %b
diff --git a/test/CodeGen/X86/phi-immediate-factoring.ll b/test/CodeGen/X86/phi-immediate-factoring.ll
index 6425ef0e8376..05a0bf68657b 100644
--- a/test/CodeGen/X86/phi-immediate-factoring.ll
+++ b/test/CodeGen/X86/phi-immediate-factoring.ll
@@ -1,5 +1,6 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -stats 2>&1 | grep "Number of blocks eliminated" | grep 6
+; RUN: llc < %s -disable-preheader-prot=true -march=x86 -stats 2>&1 | grep "Number of blocks eliminated" | grep 6
+; RUN: llc < %s -disable-preheader-prot=false -march=x86 -stats 2>&1 | grep "Number of blocks eliminated" | grep 3
; PR1296
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/CodeGen/X86/phys-reg-local-regalloc.ll b/test/CodeGen/X86/phys-reg-local-regalloc.ll
index a0adba0f8338..8b370d93afdb 100644
--- a/test/CodeGen/X86/phys-reg-local-regalloc.ll
+++ b/test/CodeGen/X86/phys-reg-local-regalloc.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -optimize-regalloc=0 | FileCheck %s
-; RUN: llc -O0 < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast | FileCheck %s
-; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=atom -regalloc=fast -optimize-regalloc=0 | FileCheck -check-prefix=ATOM %s
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -optimize-regalloc=0 -no-x86-call-frame-opt | FileCheck %s
+; RUN: llc -O0 < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -no-x86-call-frame-opt | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=atom -regalloc=fast -optimize-regalloc=0 -no-x86-call-frame-opt | FileCheck -check-prefix=ATOM %s
; CHECKed instructions should be the same with or without -O0 except on Intel Atom due to instruction scheduling.
@.str = private constant [12 x i8] c"x + y = %i\0A\00", align 1 ; <[12 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-2.ll b/test/CodeGen/X86/phys_subreg_coalesce-2.ll
index 8ee97ae07e65..a02a4ae15c32 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-2.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-2.ll
@@ -13,7 +13,7 @@ forcond.preheader: ; preds = %entry
ifthen: ; preds = %entry
ret i32 0
-; CHECK: forbody
+; CHECK: forbody{{$}}
; CHECK-NOT: mov
forbody: ; preds = %forbody, %forcond.preheader
%indvar = phi i32 [ 0, %forcond.preheader ], [ %divisor.02, %forbody ] ; <i32> [#uses=3]
diff --git a/test/CodeGen/X86/pic.ll b/test/CodeGen/X86/pic.ll
index 73be234db81c..f03dc3f4a285 100644
--- a/test/CodeGen/X86/pic.ll
+++ b/test/CodeGen/X86/pic.ll
@@ -192,7 +192,7 @@ bb12:
; LINUX: .LJTI7_0@GOTOFF(
; LINUX: jmpl *
-; LINUX: .align 4
+; LINUX: .p2align 2
; LINUX-NEXT: .LJTI7_0:
; LINUX: .long .LBB7_2@GOTOFF
; LINUX: .long .LBB7_8@GOTOFF
diff --git a/test/CodeGen/X86/pic_jumptable.ll b/test/CodeGen/X86/pic_jumptable.ll
index 8c1992a24ece..444f98ef83de 100644
--- a/test/CodeGen/X86/pic_jumptable.ll
+++ b/test/CodeGen/X86/pic_jumptable.ll
@@ -3,7 +3,7 @@
; RUN: llc < %s -relocation-model=pic -mark-data-regions -mtriple=i686-apple-darwin -asm-verbose=false \
; RUN: | FileCheck %s --check-prefix=CHECK-DATA
; RUN: llc < %s -relocation-model=pic -mtriple=i686-apple-darwin -asm-verbose=false \
-; RUN: | FileCheck %s
+; RUN: | FileCheck %s --check-prefix=CHECK-DATA
; RUN: llc < %s -mtriple=x86_64-apple-darwin | not grep 'lJTI'
; rdar://6971437
; rdar://7738756
diff --git a/test/CodeGen/X86/pie.ll b/test/CodeGen/X86/pie.ll
new file mode 100644
index 000000000000..7b765f8ef54a
--- /dev/null
+++ b/test/CodeGen/X86/pie.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -O0 -mcpu=generic -mtriple=i686-linux-gnu -relocation-model=pic | FileCheck %s
+; RUN: llc < %s -O0 -mcpu=generic -mtriple=i686-linux-gnu -fast-isel -relocation-model=pic | FileCheck %s
+; RUN: llc < %s -O0 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck %s
+; RUN: llc < %s -O0 -mcpu=generic -mtriple=x86_64-linux-gnu -fast-isel -relocation-model=pic | FileCheck %s
+
+; CHECK-LABEL: bar:
+; CHECK: call{{l|q}} foo{{$}}
+; CHECK: call{{l|q}} weak_odr_foo{{$}}
+; CHECK: call{{l|q}} weak_foo{{$}}
+; CHECK: call{{l|q}} internal_foo{{$}}
+; CHECK: call{{l|q}} ext_baz@PLT
+
+define weak void @weak_foo() {
+ ret void
+}
+
+define weak_odr void @weak_odr_foo() {
+ ret void
+}
+
+define internal void @internal_foo() {
+ ret void
+}
+
+declare i32 @ext_baz()
+
+define void @foo() {
+ ret void
+}
+
+define void @bar() {
+entry:
+ call void @foo()
+ call void @weak_odr_foo()
+ call void @weak_foo()
+ call void @internal_foo()
+ call i32 @ext_baz()
+ ret void
+}
+
+; Tests for -fpie handling of local global data should be added here.
+
+!llvm.module.flags = !{!0, !1}
+!0 = !{i32 1, !"PIC Level", i32 1}
+!1 = !{i32 1, !"PIE Level", i32 1}
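
For reference, a rough C equivalent of the module above (hypothetical, assumed purely for illustration; weak_odr linkage has no direct C spelling and is omitted). The test's CHECK lines assert that under PIE the functions defined in this module, including the weak one, are called directly, while only the undefined ext_baz goes through the PLT:

  /* Hypothetical C sketch mirroring the IR above. */
  __attribute__((weak)) void weak_foo(void) {}
  static void internal_foo(void) {}
  void foo(void) {}
  int ext_baz(void);             /* external declaration only */

  void bar(void) {
    foo();                       /* direct call per CHECK lines */
    weak_foo();                  /* direct call per CHECK lines */
    internal_foo();              /* direct call per CHECK lines */
    ext_baz();                   /* PLT call: ext_baz@PLT */
  }
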
diff --git a/test/CodeGen/X86/pku.ll b/test/CodeGen/X86/pku.ll
index 8568cf43abc0..79b8c474ade0 100644
--- a/test/CodeGen/X86/pku.ll
+++ b/test/CodeGen/X86/pku.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding -verify-machineinstrs | FileCheck %s
declare i32 @llvm.x86.rdpkru()
declare void @llvm.x86.wrpkru(i32)
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index 37b6fdf7cfeb..5f2c88d670ac 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1,10 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
-define <16 x i8> @mul8c(<16 x i8> %i) nounwind {
-; SSE2-LABEL: mul8c:
+define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
+; SSE2-LABEL: mul_v16i8c:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE2-NEXT: psraw $8, %xmm1
@@ -21,7 +23,7 @@ define <16 x i8> @mul8c(<16 x i8> %i) nounwind {
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: mul8c:
+; SSE41-LABEL: mul_v16i8c:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm2
@@ -36,7 +38,7 @@ define <16 x i8> @mul8c(<16 x i8> %i) nounwind {
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX2-LABEL: mul8c:
+; AVX2-LABEL: mul_v16i8c:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm1
@@ -48,28 +50,46 @@ define <16 x i8> @mul8c(<16 x i8> %i) nounwind {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mul_v16i8c:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT: vpmovsxbw {{.*}}(%rip), %ymm1
+; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: mul_v16i8c:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpmovsxbw {{.*}}(%rip), %ymm1
+; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
entry:
%A = mul <16 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
ret <16 x i8> %A
}
-define <8 x i16> @mul16c(<8 x i16> %i) nounwind {
-; SSE-LABEL: mul16c:
+define <8 x i16> @mul_v8i16c(<8 x i16> %i) nounwind {
+; SSE-LABEL: mul_v8i16c:
; SSE: # BB#0: # %entry
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
-; AVX2-LABEL: mul16c:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: mul_v8i16c:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
%A = mul <8 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
ret <8 x i16> %A
}
-define <4 x i32> @a(<4 x i32> %i) nounwind {
-; SSE2-LABEL: a:
+define <4 x i32> @mul_v4i32c(<4 x i32> %i) nounwind {
+; SSE2-LABEL: mul_v4i32c:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [117,117,117,117]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
@@ -80,23 +100,23 @@ define <4 x i32> @a(<4 x i32> %i) nounwind {
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
-; SSE41-LABEL: a:
+; SSE41-LABEL: mul_v4i32c:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT: retq
;
-; AVX2-LABEL: a:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
-; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: mul_v4i32c:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
%A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
ret <4 x i32> %A
}
-define <2 x i64> @b(<2 x i64> %i) nounwind {
-; SSE-LABEL: b:
+define <2 x i64> @mul_v2i64c(<2 x i64> %i) nounwind {
+; SSE-LABEL: mul_v2i64c:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [117,117]
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -107,22 +127,22 @@ define <2 x i64> @b(<2 x i64> %i) nounwind {
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX2-LABEL: b:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [117,117]
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: mul_v2i64c:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [117,117]
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX-NEXT: retq
entry:
%A = mul <2 x i64> %i, < i64 117, i64 117 >
ret <2 x i64> %A
}
-define <16 x i8> @mul8(<16 x i8> %i, <16 x i8> %j) nounwind {
-; SSE2-LABEL: mul8:
+define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind {
+; SSE2-LABEL: mul_v16i8:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -142,7 +162,7 @@ define <16 x i8> @mul8(<16 x i8> %i, <16 x i8> %j) nounwind {
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: retq
;
-; SSE41-LABEL: mul8:
+; SSE41-LABEL: mul_v16i8:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxbw %xmm1, %xmm3
; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
@@ -159,7 +179,7 @@ define <16 x i8> @mul8(<16 x i8> %i, <16 x i8> %j) nounwind {
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX2-LABEL: mul8:
+; AVX2-LABEL: mul_v16i8:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
@@ -171,28 +191,46 @@ define <16 x i8> @mul8(<16 x i8> %i, <16 x i8> %j) nounwind {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mul_v16i8:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: mul_v16i8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
entry:
%A = mul <16 x i8> %i, %j
ret <16 x i8> %A
}
-define <8 x i16> @mul16(<8 x i16> %i, <8 x i16> %j) nounwind {
-; SSE-LABEL: mul16:
+define <8 x i16> @mul_v8i16(<8 x i16> %i, <8 x i16> %j) nounwind {
+; SSE-LABEL: mul_v8i16:
; SSE: # BB#0: # %entry
; SSE-NEXT: pmullw %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX2-LABEL: mul16:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: mul_v8i16:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
%A = mul <8 x i16> %i, %j
ret <8 x i16> %A
}
-define <4 x i32> @c(<4 x i32> %i, <4 x i32> %j) nounwind {
-; SSE2-LABEL: c:
+define <4 x i32> @mul_v4i32(<4 x i32> %i, <4 x i32> %j) nounwind {
+; SSE2-LABEL: mul_v4i32:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
@@ -203,22 +241,22 @@ define <4 x i32> @c(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
-; SSE41-LABEL: c:
+; SSE41-LABEL: mul_v4i32:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX2-LABEL: c:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: mul_v4i32:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
%A = mul <4 x i32> %i, %j
ret <4 x i32> %A
}
-define <2 x i64> @d(<2 x i64> %i, <2 x i64> %j) nounwind {
-; SSE-LABEL: d:
+define <2 x i64> @mul_v2i64(<2 x i64> %i, <2 x i64> %j) nounwind {
+; SSE-LABEL: mul_v2i64:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
@@ -226,25 +264,25 @@ define <2 x i64> @d(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: psllq $32, %xmm3
-; SSE-NEXT: paddq %xmm3, %xmm2
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX2-LABEL: d:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX2-NEXT: vpsllq $32, %xmm3, %xmm3
-; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: mul_v2i64:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; AVX-NEXT: retq
entry:
%A = mul <2 x i64> %i, %j
ret <2 x i64> %A
@@ -252,8 +290,8 @@ entry:
declare void @foo()
-define <4 x i32> @e(<4 x i32> %i, <4 x i32> %j) nounwind {
-; SSE2-LABEL: e:
+define <4 x i32> @mul_v4i32spill(<4 x i32> %i, <4 x i32> %j) nounwind {
+; SSE2-LABEL: mul_v4i32spill:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: subq $40, %rsp
; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
@@ -271,7 +309,7 @@ define <4 x i32> @e(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2-NEXT: addq $40, %rsp
; SSE2-NEXT: retq
;
-; SSE41-LABEL: e:
+; SSE41-LABEL: mul_v4i32spill:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: subq $40, %rsp
; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
@@ -282,16 +320,16 @@ define <4 x i32> @e(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE41-NEXT: addq $40, %rsp
; SSE41-NEXT: retq
;
-; AVX2-LABEL: e:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: subq $40, %rsp
-; AVX2-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT: callq foo
-; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX2-NEXT: vpmulld {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT: addq $40, %rsp
-; AVX2-NEXT: retq
+; AVX-LABEL: mul_v4i32spill:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: subq $40, %rsp
+; AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: callq foo
+; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT: vpmulld {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: addq $40, %rsp
+; AVX-NEXT: retq
entry:
; Use a call to force spills.
call void @foo()
@@ -299,8 +337,8 @@ entry:
ret <4 x i32> %A
}
-define <2 x i64> @f(<2 x i64> %i, <2 x i64> %j) nounwind {
-; SSE-LABEL: f:
+define <2 x i64> @mul_v2i64spill(<2 x i64> %i, <2 x i64> %j) nounwind {
+; SSE-LABEL: mul_v2i64spill:
; SSE: # BB#0: # %entry
; SSE-NEXT: subq $40, %rsp
; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
@@ -314,33 +352,55 @@ define <2 x i64> @f(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm0, %xmm1
; SSE-NEXT: psllq $32, %xmm1
-; SSE-NEXT: paddq %xmm1, %xmm2
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm3, %xmm0
; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: addq $40, %rsp
; SSE-NEXT: retq
;
-; AVX2-LABEL: f:
+; AVX2-LABEL: mul_v2i64spill:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: subq $40, %rsp
; AVX2-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: callq foo
; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; AVX2-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload
-; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm0
+; AVX2-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
+; AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm0
; AVX2-NEXT: vpsrlq $32, %xmm2, %xmm1
-; AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlq $32, %xmm3, %xmm1
-; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmovdqa %xmm2, %xmm3
+; AVX2-NEXT: vpmuludq %xmm1, %xmm4, %xmm1
; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlq $32, %xmm4, %xmm2
+; AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX2-NEXT: vpaddq %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: addq $40, %rsp
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v2i64spill:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: subq $40, %rsp
+; AVX512-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: callq foo
+; AVX512-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; AVX512-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
+; AVX512-NEXT: vpmuludq %xmm2, %xmm4, %xmm0
+; AVX512-NEXT: vpsrlq $32, %xmm2, %xmm1
+; AVX512-NEXT: vmovaps %zmm2, %zmm3
+; AVX512-NEXT: vpmuludq %xmm1, %xmm4, %xmm1
+; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX512-NEXT: vpsrlq $32, %xmm4, %xmm2
+; AVX512-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX512-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: addq $40, %rsp
+; AVX512-NEXT: retq
entry:
; Use a call to force spills.
call void @foo()
@@ -348,8 +408,160 @@ entry:
ret <2 x i64> %A
}
-define <4 x i64> @b1(<4 x i64> %i) nounwind {
-; SSE-LABEL: b1:
+define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
+; SSE2-LABEL: mul_v32i8c:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm2, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pmullw %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: packuswb %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmullw %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: mul_v32i8c:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
+; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm4
+; SSE41-NEXT: pmullw %xmm4, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm5, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
+; SSE41-NEXT: pmullw %xmm4, %xmm0
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: packuswb %xmm0, %xmm2
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm3
+; SSE41-NEXT: pmullw %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm5, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
+; SSE41-NEXT: pmullw %xmm4, %xmm0
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: packuswb %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: mul_v32i8c:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mul_v32i8c:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1
+; AVX512F-NEXT: vpmovsxbw {{.*}}(%rip), %ymm2
+; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: mul_v32i8c:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vmovaps {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
+entry:
+ %A = mul <32 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
+ ret <32 x i8> %A
+}
+
+define <16 x i16> @mul_v16i16c(<16 x i16> %i) nounwind {
+; SSE-LABEL: mul_v16i16c:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
+; SSE-NEXT: pmullw %xmm2, %xmm0
+; SSE-NEXT: pmullw %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: mul_v16i16c:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: retq
+entry:
+ %A = mul <16 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
+ ret <16 x i16> %A
+}
+
+define <8 x i32> @mul_v8i32c(<8 x i32> %i) nounwind {
+; SSE2-LABEL: mul_v8i32c:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: mul_v8i32c:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117]
+; SSE41-NEXT: pmulld %xmm2, %xmm0
+; SSE41-NEXT: pmulld %xmm2, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: mul_v8i32c:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+entry:
+ %A = mul <8 x i32> %i, < i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117 >
+ ret <8 x i32> %A
+}
+
+define <4 x i64> @mul_v4i64c(<4 x i64> %i) nounwind {
+; SSE-LABEL: mul_v4i64c:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [117,117]
; SSE-NEXT: movdqa %xmm0, %xmm3
@@ -366,22 +578,188 @@ define <4 x i64> @b1(<4 x i64> %i) nounwind {
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: retq
;
-; AVX2-LABEL: b1:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: retq
+; AVX-LABEL: mul_v4i64c:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
+; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX-NEXT: retq
entry:
%A = mul <4 x i64> %i, < i64 117, i64 117, i64 117, i64 117 >
ret <4 x i64> %A
}
-define <4 x i64> @b2(<4 x i64> %i, <4 x i64> %j) nounwind {
-; SSE-LABEL: b2:
+define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind {
+; SSE2-LABEL: mul_v32i8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm5
+; SSE2-NEXT: pmullw %xmm4, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pmullw %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: packuswb %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm5
+; SSE2-NEXT: pmullw %xmm2, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmullw %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: packuswb %xmm5, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: mul_v32i8:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm5
+; SSE41-NEXT: pmovsxbw %xmm0, %xmm4
+; SSE41-NEXT: pmullw %xmm5, %xmm4
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm5, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
+; SSE41-NEXT: pmullw %xmm2, %xmm0
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: packuswb %xmm0, %xmm4
+; SSE41-NEXT: pmovsxbw %xmm3, %xmm0
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
+; SSE41-NEXT: pmullw %xmm0, %xmm2
+; SSE41-NEXT: pand %xmm5, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm1
+; SSE41-NEXT: pmullw %xmm0, %xmm1
+; SSE41-NEXT: pand %xmm5, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: mul_v32i8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpmovsxbw %xmm3, %ymm3
+; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mul_v32i8:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm2
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm3
+; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: mul_v32i8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
+entry:
+ %A = mul <32 x i8> %i, %j
+ ret <32 x i8> %A
+}
+
+define <16 x i16> @mul_v16i16(<16 x i16> %i, <16 x i16> %j) nounwind {
+; SSE-LABEL: mul_v16i16:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: pmullw %xmm2, %xmm0
+; SSE-NEXT: pmullw %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: mul_v16i16:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+entry:
+ %A = mul <16 x i16> %i, %j
+ ret <16 x i16> %A
+}
+
+define <8 x i32> @mul_v8i32(<8 x i32> %i, <8 x i32> %j) nounwind {
+; SSE2-LABEL: mul_v8i32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: mul_v8i32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmulld %xmm2, %xmm0
+; SSE41-NEXT: pmulld %xmm3, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: mul_v8i32:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+entry:
+ %A = mul <8 x i32> %i, %j
+ ret <8 x i32> %A
+}
+
+define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind {
+; SSE-LABEL: mul_v4i64:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: pmuludq %xmm2, %xmm4
@@ -389,10 +767,10 @@ define <4 x i64> @b2(<4 x i64> %i, <4 x i64> %j) nounwind {
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm0, %xmm5
; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: paddq %xmm5, %xmm4
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm5, %xmm0
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pmuludq %xmm3, %xmm2
@@ -400,27 +778,401 @@ define <4 x i64> @b2(<4 x i64> %i, <4 x i64> %j) nounwind {
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm1, %xmm4
; SSE-NEXT: psllq $32, %xmm4
-; SSE-NEXT: paddq %xmm4, %xmm2
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: psllq $32, %xmm1
+; SSE-NEXT: paddq %xmm4, %xmm1
; SSE-NEXT: paddq %xmm2, %xmm1
; SSE-NEXT: retq
;
-; AVX2-LABEL: b2:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
-; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: retq
+; AVX-LABEL: mul_v4i64:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
+; AVX-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX-NEXT: retq
entry:
%A = mul <4 x i64> %i, %j
ret <4 x i64> %A
}
+define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
+; SSE2-LABEL: mul_v64i8c:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; SSE2-NEXT: psraw $8, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm6
+; SSE2-NEXT: pmullw %xmm4, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pmullw %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: packuswb %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm6
+; SSE2-NEXT: pmullw %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmullw %xmm4, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: packuswb %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm6
+; SSE2-NEXT: pmullw %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: pmullw %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: packuswb %xmm6, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm6
+; SSE2-NEXT: pmullw %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm4, %xmm3
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: packuswb %xmm6, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: mul_v64i8c:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
+; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm6
+; SSE41-NEXT: pmullw %xmm6, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm7, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm1
+; SSE41-NEXT: pmullw %xmm6, %xmm1
+; SSE41-NEXT: pand %xmm7, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: pmovsxbw %xmm4, %xmm1
+; SSE41-NEXT: pmullw %xmm6, %xmm1
+; SSE41-NEXT: pand %xmm7, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm4, %xmm4
+; SSE41-NEXT: pmullw %xmm6, %xmm4
+; SSE41-NEXT: pand %xmm7, %xmm4
+; SSE41-NEXT: packuswb %xmm4, %xmm1
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm4
+; SSE41-NEXT: pmullw %xmm6, %xmm4
+; SSE41-NEXT: pand %xmm7, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
+; SSE41-NEXT: pmullw %xmm6, %xmm2
+; SSE41-NEXT: pand %xmm7, %xmm2
+; SSE41-NEXT: packuswb %xmm2, %xmm4
+; SSE41-NEXT: pmovsxbw %xmm3, %xmm5
+; SSE41-NEXT: pmullw %xmm6, %xmm5
+; SSE41-NEXT: pand %xmm7, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
+; SSE41-NEXT: pmullw %xmm6, %xmm2
+; SSE41-NEXT: pand %xmm7, %xmm2
+; SSE41-NEXT: packuswb %xmm2, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm5, %xmm3
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: mul_v64i8c:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm3
+; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mul_v64i8c:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm2
+; AVX512F-NEXT: vpmovsxbw {{.*}}(%rip), %ymm3
+; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm2
+; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: mul_v64i8c:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vmovaps {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm2
+; AVX512BW-NEXT: vpmullw %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+entry:
+ %A = mul <64 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
+ ret <64 x i8> %A
+}
+
+define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
+; SSE2-LABEL: mul_v64i8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa %xmm4, %xmm8
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm8
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm9
+; SSE2-NEXT: pmullw %xmm8, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm8, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm0
+; SSE2-NEXT: pmullw %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: packuswb %xmm9, %xmm0
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm9
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm4
+; SSE2-NEXT: pmullw %xmm9, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm5
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmullw %xmm5, %xmm1
+; SSE2-NEXT: pand %xmm8, %xmm1
+; SSE2-NEXT: packuswb %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm5
+; SSE2-NEXT: pmullw %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: pmullw %xmm6, %xmm2
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: packuswb %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm7, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm5
+; SSE2-NEXT: pmullw %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm7
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm7, %xmm3
+; SSE2-NEXT: pand %xmm8, %xmm3
+; SSE2-NEXT: packuswb %xmm5, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: mul_v64i8:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movdqa %xmm1, %xmm8
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pmovsxbw %xmm4, %xmm9
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
+; SSE41-NEXT: pmullw %xmm9, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm9, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm4, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm1
+; SSE41-NEXT: pmullw %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm9, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
+; SSE41-NEXT: pmovsxbw %xmm5, %xmm4
+; SSE41-NEXT: pmovsxbw %xmm8, %xmm1
+; SSE41-NEXT: pmullw %xmm4, %xmm1
+; SSE41-NEXT: pand %xmm9, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm4, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm5, %xmm5
+; SSE41-NEXT: pmullw %xmm4, %xmm5
+; SSE41-NEXT: pand %xmm9, %xmm5
+; SSE41-NEXT: packuswb %xmm5, %xmm1
+; SSE41-NEXT: pmovsxbw %xmm6, %xmm5
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm4
+; SSE41-NEXT: pmullw %xmm5, %xmm4
+; SSE41-NEXT: pand %xmm9, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm5, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
+; SSE41-NEXT: pmullw %xmm5, %xmm2
+; SSE41-NEXT: pand %xmm9, %xmm2
+; SSE41-NEXT: packuswb %xmm2, %xmm4
+; SSE41-NEXT: pmovsxbw %xmm7, %xmm2
+; SSE41-NEXT: pmovsxbw %xmm3, %xmm5
+; SSE41-NEXT: pmullw %xmm2, %xmm5
+; SSE41-NEXT: pand %xmm9, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm3, %xmm3
+; SSE41-NEXT: pmullw %xmm2, %xmm3
+; SSE41-NEXT: pand %xmm9, %xmm3
+; SSE41-NEXT: packuswb %xmm3, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm5, %xmm3
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: mul_v64i8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vpmovsxbw %xmm4, %ymm4
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX2-NEXT: vpmovsxbw %xmm5, %ymm5
+; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm5
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm6
+; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
+; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2
+; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT: vpmovsxbw %xmm5, %ymm5
+; AVX2-NEXT: vpmullw %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; AVX2-NEXT: vpmovsxbw %xmm3, %ymm3
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: mul_v64i8:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm4
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm5
+; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4
+; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm2
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm4
+; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm3
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: mul_v64i8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm2
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm3
+; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+entry:
+ %A = mul <64 x i8> %i, %j
+ ret <64 x i8> %A
+}
+
diff --git a/test/CodeGen/X86/pop-stack-cleanup.ll b/test/CodeGen/X86/pop-stack-cleanup.ll
index bcf7594065f3..f81d911ea31b 100644
--- a/test/CodeGen/X86/pop-stack-cleanup.ll
+++ b/test/CodeGen/X86/pop-stack-cleanup.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=i686-windows | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=LINUX64
declare void @param1(i32 %a)
@@ -7,6 +7,7 @@ declare i64 @param2_ret64(i32 %a, i32 %b)
declare void @param2(i32 %a, i32 %b)
declare void @param3(i32 %a, i32 %b, i32 %c)
declare void @param8(i64, i64, i64, i64, i64, i64, i64, i64)
+declare i32 @param8_ret(i64, i64, i64, i64, i64, i64, i64, i64)
define void @test() minsize nounwind {
@@ -74,3 +75,13 @@ define void @test_linux64(i32 %size) minsize nounwind {
call void @param8(i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8)
ret void
}
+
+define i32 @test_linux64_i32(i32 %size) minsize nounwind {
+; LINUX64-LABEL: test_linux64_i32:
+; LINUX64: callq param8_ret
+; LINUX64-NOT: popq %rax
+; LINUX64: retq
+ %a = alloca i64, i32 %size, align 8
+ %r = call i32 @param8_ret(i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8)
+ ret i32 %r
+}
diff --git a/test/CodeGen/X86/popcnt.ll b/test/CodeGen/X86/popcnt.ll
index e9350de101f6..b5d4ebba0538 100644
--- a/test/CodeGen/X86/popcnt.ll
+++ b/test/CodeGen/X86/popcnt.ll
@@ -1,35 +1,252 @@
-; RUN: llc -march=x86-64 -mattr=+popcnt < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X32-POPCNT
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X64-POPCNT
define i8 @cnt8(i8 %x) nounwind readnone {
+; X32-LABEL: cnt8:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb %al
+; X32-NEXT: andb $85, %al
+; X32-NEXT: subb %al, %cl
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: andb $51, %al
+; X32-NEXT: shrb $2, %cl
+; X32-NEXT: andb $51, %cl
+; X32-NEXT: addb %al, %cl
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb $4, %al
+; X32-NEXT: addb %cl, %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: retl
+;
+; X64-LABEL: cnt8:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shrb %al
+; X64-NEXT: andb $85, %al
+; X64-NEXT: subb %al, %dil
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andb $51, %al
+; X64-NEXT: shrb $2, %dil
+; X64-NEXT: andb $51, %dil
+; X64-NEXT: addb %al, %dil
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shrb $4, %al
+; X64-NEXT: addb %dil, %al
+; X64-NEXT: andb $15, %al
+; X64-NEXT: retq
+;
+; X32-POPCNT-LABEL: cnt8:
+; X32-POPCNT: # BB#0:
+; X32-POPCNT-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-POPCNT-NEXT: popcntw %ax, %ax
+; X32-POPCNT-NEXT: # kill: %AL<def> %AL<kill> %AX<kill>
+; X32-POPCNT-NEXT: retl
+;
+; X64-POPCNT-LABEL: cnt8:
+; X64-POPCNT: # BB#0:
+; X64-POPCNT-NEXT: movzbl %dil, %eax
+; X64-POPCNT-NEXT: popcntw %ax, %ax
+; X64-POPCNT-NEXT: # kill: %AL<def> %AL<kill> %AX<kill>
+; X64-POPCNT-NEXT: retq
%cnt = tail call i8 @llvm.ctpop.i8(i8 %x)
ret i8 %cnt
-; CHECK-LABEL: cnt8:
-; CHECK: popcntw
-; CHECK: ret
}
define i16 @cnt16(i16 %x) nounwind readnone {
+; X32-LABEL: cnt16:
+; X32: # BB#0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl %ecx
+; X32-NEXT: andl $21845, %ecx # imm = 0x5555
+; X32-NEXT: subl %ecx, %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andl $13107, %ecx # imm = 0x3333
+; X32-NEXT: shrl $2, %eax
+; X32-NEXT: andl $13107, %eax # imm = 0x3333
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andl $32752, %ecx # imm = 0x7FF0
+; X32-NEXT: shrl $4, %ecx
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: andl $3855, %ecx # imm = 0xF0F
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shll $8, %eax
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %ah, %eax
+; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: retl
+;
+; X64-LABEL: cnt16:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shrl %eax
+; X64-NEXT: andl $21845, %eax # imm = 0x5555
+; X64-NEXT: subl %eax, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $13107, %eax # imm = 0x3333
+; X64-NEXT: shrl $2, %edi
+; X64-NEXT: andl $13107, %edi # imm = 0x3333
+; X64-NEXT: addl %eax, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $32752, %eax # imm = 0x7FF0
+; X64-NEXT: shrl $4, %eax
+; X64-NEXT: addl %edi, %eax
+; X64-NEXT: andl $3855, %eax # imm = 0xF0F
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: shll $8, %ecx
+; X64-NEXT: addl %eax, %ecx
+; X64-NEXT: movzbl %ch, %eax # NOREX
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: retq
+;
+; X32-POPCNT-LABEL: cnt16:
+; X32-POPCNT: # BB#0:
+; X32-POPCNT-NEXT: popcntw {{[0-9]+}}(%esp), %ax
+; X32-POPCNT-NEXT: retl
+;
+; X64-POPCNT-LABEL: cnt16:
+; X64-POPCNT: # BB#0:
+; X64-POPCNT-NEXT: popcntw %di, %ax
+; X64-POPCNT-NEXT: retq
%cnt = tail call i16 @llvm.ctpop.i16(i16 %x)
ret i16 %cnt
-; CHECK-LABEL: cnt16:
-; CHECK: popcntw
-; CHECK: ret
}
define i32 @cnt32(i32 %x) nounwind readnone {
+; X32-LABEL: cnt32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl %ecx
+; X32-NEXT: andl $1431655765, %ecx # imm = 0x55555555
+; X32-NEXT: subl %ecx, %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andl $858993459, %ecx # imm = 0x33333333
+; X32-NEXT: shrl $2, %eax
+; X32-NEXT: andl $858993459, %eax # imm = 0x33333333
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $4, %ecx
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
+; X32-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101
+; X32-NEXT: shrl $24, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: cnt32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shrl %eax
+; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555
+; X64-NEXT: subl %eax, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $858993459, %eax # imm = 0x33333333
+; X64-NEXT: shrl $2, %edi
+; X64-NEXT: andl $858993459, %edi # imm = 0x33333333
+; X64-NEXT: addl %eax, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shrl $4, %eax
+; X64-NEXT: addl %edi, %eax
+; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
+; X64-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101
+; X64-NEXT: shrl $24, %eax
+; X64-NEXT: retq
+;
+; X32-POPCNT-LABEL: cnt32:
+; X32-POPCNT: # BB#0:
+; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
+; X32-POPCNT-NEXT: retl
+;
+; X64-POPCNT-LABEL: cnt32:
+; X64-POPCNT: # BB#0:
+; X64-POPCNT-NEXT: popcntl %edi, %eax
+; X64-POPCNT-NEXT: retq
%cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
ret i32 %cnt
-; CHECK-LABEL: cnt32:
-; CHECK: popcntl
-; CHECK: ret
}
define i64 @cnt64(i64 %x) nounwind readnone {
+; X32-LABEL: cnt64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl %edx
+; X32-NEXT: andl $1431655765, %edx # imm = 0x55555555
+; X32-NEXT: subl %edx, %ecx
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andl $858993459, %edx # imm = 0x33333333
+; X32-NEXT: shrl $2, %ecx
+; X32-NEXT: andl $858993459, %ecx # imm = 0x33333333
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $4, %edx
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
+; X32-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101
+; X32-NEXT: shrl $24, %ecx
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: shrl %edx
+; X32-NEXT: andl $1431655765, %edx # imm = 0x55555555
+; X32-NEXT: subl %edx, %eax
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: andl $858993459, %edx # imm = 0x33333333
+; X32-NEXT: shrl $2, %eax
+; X32-NEXT: andl $858993459, %eax # imm = 0x33333333
+; X32-NEXT: addl %edx, %eax
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: shrl $4, %edx
+; X32-NEXT: addl %eax, %edx
+; X32-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
+; X32-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101
+; X32-NEXT: shrl $24, %eax
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+;
+; X64-LABEL: cnt64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shrq %rax
+; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: subq %rcx, %rdi
+; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; X64-NEXT: movq %rdi, %rcx
+; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: shrq $2, %rdi
+; X64-NEXT: andq %rax, %rdi
+; X64-NEXT: addq %rcx, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: shrq $4, %rax
+; X64-NEXT: leaq (%rax,%rdi), %rax
+; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
+; X64-NEXT: imulq %rcx, %rax
+; X64-NEXT: shrq $56, %rax
+; X64-NEXT: retq
+;
+; X32-POPCNT-LABEL: cnt64:
+; X32-POPCNT: # BB#0:
+; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
+; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
+; X32-POPCNT-NEXT: addl %ecx, %eax
+; X32-POPCNT-NEXT: xorl %edx, %edx
+; X32-POPCNT-NEXT: retl
+;
+; X64-POPCNT-LABEL: cnt64:
+; X64-POPCNT: # BB#0:
+; X64-POPCNT-NEXT: popcntq %rdi, %rax
+; X64-POPCNT-NEXT: retq
%cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
ret i64 %cnt
-; CHECK-LABEL: cnt64:
-; CHECK: popcntq
-; CHECK: ret
}
declare i8 @llvm.ctpop.i8(i8) nounwind readnone
diff --git a/test/CodeGen/X86/post-ra-sched.ll b/test/CodeGen/X86/post-ra-sched.ll
new file mode 100644
index 000000000000..c31072a8a5eb
--- /dev/null
+++ b/test/CodeGen/X86/post-ra-sched.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -mtriple=i386 -mcpu=pentium4 | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=pentium4m | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=pentium-m | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=prescott | FileCheck %s
+; RUN: llc < %s -mtriple=i386 -mcpu=nocona | FileCheck %s
+;
+; Verify that scheduling puts some distance between a load feeding into
+; the address of another load, and that second load. This currently
+; happens during the post-RA-scheduler, which should be enabled by
+; default with the above specified cpus.
+
+@ptrs = external global [0 x i32*], align 4
+@idxa = common global i32 0, align 4
+@idxb = common global i32 0, align 4
+@res = common global i32 0, align 4
+
+define void @addindirect() {
+; CHECK-LABEL: addindirect:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl idxb, %ecx
+; CHECK-NEXT: movl idxa, %eax
+; CHECK-NEXT: movl ptrs(,%ecx,4), %ecx
+; CHECK-NEXT: movl ptrs(,%eax,4), %eax
+; CHECK-NEXT: movl (%ecx), %ecx
+; CHECK-NEXT: addl (%eax), %ecx
+; CHECK-NEXT: movl %ecx, res
+; CHECK-NEXT: retl
+entry:
+ %0 = load i32, i32* @idxa, align 4
+ %arrayidx = getelementptr inbounds [0 x i32*], [0 x i32*]* @ptrs, i32 0, i32 %0
+ %1 = load i32*, i32** %arrayidx, align 4
+ %2 = load i32, i32* %1, align 4
+ %3 = load i32, i32* @idxb, align 4
+ %arrayidx1 = getelementptr inbounds [0 x i32*], [0 x i32*]* @ptrs, i32 0, i32 %3
+ %4 = load i32*, i32** %arrayidx1, align 4
+ %5 = load i32, i32* %4, align 4
+ %add = add i32 %5, %2
+ store i32 %add, i32* @res, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/postra-licm.ll b/test/CodeGen/X86/postra-licm.ll
index 5c93160125e7..329184a88ff0 100644
--- a/test/CodeGen/X86/postra-licm.ll
+++ b/test/CodeGen/X86/postra-licm.ll
@@ -70,7 +70,7 @@ bb26.preheader: ; preds = %imix_test.exit
bb23: ; preds = %imix_test.exit
unreachable
; Verify that there are no loads inside the loop.
-; X86-32: .align 4
+; X86-32: .p2align 4
; X86-32: %bb28
; X86-32-NOT: (%esp),
; X86-32-NOT: (%ebp),
@@ -152,7 +152,7 @@ entry:
bb.nph: ; preds = %entry
; X86-64: movq _map_4_to_16@GOTPCREL(%rip)
-; X86-64: .align 4
+; X86-64: .p2align 4
%tmp5 = zext i32 undef to i64 ; <i64> [#uses=1]
%tmp6 = add i64 %tmp5, 1 ; <i64> [#uses=1]
%tmp11 = shl i64 undef, 1 ; <i64> [#uses=1]
diff --git a/test/CodeGen/X86/powi.ll b/test/CodeGen/X86/powi.ll
index 88b5f4eb14b0..fb7f570d6251 100644
--- a/test/CodeGen/X86/powi.ll
+++ b/test/CodeGen/X86/powi.ll
@@ -29,8 +29,9 @@ define double @pow_wrapper_optsize(double %a) optsize {
define double @pow_wrapper_minsize(double %a) minsize {
; CHECK-LABEL: pow_wrapper_minsize:
; CHECK: # BB#0:
-; CHECK-NEXT: movl $15, %edi
-; CHECK-NEXT: jmp
+; CHECK-NEXT: pushq $15
+; CHECK: popq %rdi
+; CHECK: jmp
%ret = tail call double @llvm.powi.f64(double %a, i32 15) nounwind ; <double> [#uses=1]
ret double %ret
}
diff --git a/test/CodeGen/X86/pr15267.ll b/test/CodeGen/X86/pr15267.ll
index 9fc754aa1128..d62aaf90587d 100644
--- a/test/CodeGen/X86/pr15267.ll
+++ b/test/CodeGen/X86/pr15267.ll
@@ -7,18 +7,14 @@ define <4 x i3> @test1(<4 x i3>* %in) nounwind {
; CHECK-NEXT: movzwl (%rdi), %eax
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shrl $3, %ecx
-; CHECK-NEXT: andl $7, %ecx
-; CHECK-NEXT: movl %eax, %edx
-; CHECK-NEXT: andl $7, %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
+; CHECK-NEXT: vmovd %eax, %xmm0
; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shrl $6, %ecx
-; CHECK-NEXT: andl $7, %ecx
; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
; CHECK-NEXT: shrl $9, %eax
-; CHECK-NEXT: andl $7, %eax
; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
%ret = load <4 x i3>, <4 x i3>* %in, align 1
ret <4 x i3> %ret
@@ -30,18 +26,14 @@ define <4 x i1> @test2(<4 x i1>* %in) nounwind {
; CHECK-NEXT: movzbl (%rdi), %eax
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shrl %ecx
-; CHECK-NEXT: andl $1, %ecx
-; CHECK-NEXT: movl %eax, %edx
-; CHECK-NEXT: andl $1, %edx
-; CHECK-NEXT: vmovd %edx, %xmm0
+; CHECK-NEXT: vmovd %eax, %xmm0
; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shrl $2, %ecx
-; CHECK-NEXT: andl $1, %ecx
; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
; CHECK-NEXT: shrl $3, %eax
-; CHECK-NEXT: andl $1, %eax
; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
%ret = load <4 x i1>, <4 x i1>* %in, align 1
ret <4 x i1> %ret
diff --git a/test/CodeGen/X86/pr16360.ll b/test/CodeGen/X86/pr16360.ll
index 1f73a4d43600..0d2878dc6af0 100644
--- a/test/CodeGen/X86/pr16360.ll
+++ b/test/CodeGen/X86/pr16360.ll
@@ -1,16 +1,17 @@
-; RUN: llc < %s -mcpu=pentium4 -mtriple=i686-pc-linux | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-pc-linux | FileCheck %s
define i64 @foo(i32 %sum) {
+; CHECK-LABEL: foo:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: shrl $2, %eax
+; CHECK-NEXT: orl $-67108864, %eax # imm = 0xFC000000
+; CHECK-NEXT: movl $1073741823, %edx # imm = 0x3FFFFFFF
+; CHECK-NEXT: retl
entry:
%conv = sext i32 %sum to i64
%shr = lshr i64 %conv, 2
%or = or i64 4611686018360279040, %shr
ret i64 %or
}
-
-; CHECK: foo
-; CHECK: shrl $2
-; CHECK: orl $-67108864
-; CHECK-NOT: movl $-1
-; CHECK: movl $1073741823
-; CHECK: ret
diff --git a/test/CodeGen/X86/pr17764.ll b/test/CodeGen/X86/pr17764.ll
index 7a3fd6d1810b..a44248ff3f59 100644
--- a/test/CodeGen/X86/pr17764.ll
+++ b/test/CodeGen/X86/pr17764.ll
@@ -1,10 +1,16 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core-avx2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s
define <16 x i16> @foo(<16 x i1> %mask, <16 x i16> %x, <16 x i16> %y) {
+; CHECK-LABEL: foo:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NEXT: vpsllw $15, %ymm0, %ymm0
+; CHECK-NEXT: vpsraw $15, %ymm0, %ymm0
+; CHECK-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+;
%ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %y
ret <16 x i16> %ret
}
-; CHECK: foo
-; CHECK: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
-; CHECK: ret
diff --git a/test/CodeGen/X86/pr23664.ll b/test/CodeGen/X86/pr23664.ll
index a501c0db837e..155fc03de83b 100644
--- a/test/CodeGen/X86/pr23664.ll
+++ b/test/CodeGen/X86/pr23664.ll
@@ -9,6 +9,6 @@ define i2 @f(i32 %arg) {
; CHECK-LABEL: f:
; CHECK: addb %dil, %dil
; CHECK-NEXT: orb $1, %dil
-; CHECK-NEXT: movb %dil, %al
+; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
}
diff --git a/test/CodeGen/X86/pr2585.ll b/test/CodeGen/X86/pr2585.ll
new file mode 100644
index 000000000000..7796ee9a2628
--- /dev/null
+++ b/test/CodeGen/X86/pr2585.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
+
+@0 = external constant <4 x i32> ; <<4 x i32>*>:0 [#uses=1]
+@1 = external constant <4 x i16> ; <<4 x i16>*>:1 [#uses=1]
+
+define internal void @PR2585() {
+; X32-LABEL: PR2585:
+; X32: # BB#0:
+; X32-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
+; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-NEXT: movq %xmm0, __unnamed_2
+; X32-NEXT: retl
+;
+; X64-LABEL: PR2585:
+; X64: # BB#0:
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
+; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: movq %xmm0, {{.*}}(%rip)
+; X64-NEXT: retq
+ load <4 x i32>, <4 x i32>* @0, align 16 ; <<4 x i32>>:1 [#uses=1]
+ bitcast <4 x i32> %1 to <8 x i16> ; <<8 x i16>>:2 [#uses=1]
+ shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef > ; <<8 x i16>>:3 [#uses=1]
+ bitcast <8 x i16> %3 to <2 x i64> ; <<2 x i64>>:4 [#uses=1]
+ extractelement <2 x i64> %4, i32 0 ; <i64>:5 [#uses=1]
+ bitcast i64 %5 to <4 x i16> ; <<4 x i16>>:6 [#uses=1]
+ store <4 x i16> %6, <4 x i16>* @1, align 8
+ ret void
+}
diff --git a/test/CodeGen/X86/pr26350.ll b/test/CodeGen/X86/pr26350.ll
new file mode 100644
index 000000000000..6e87cb3e8b7a
--- /dev/null
+++ b/test/CodeGen/X86/pr26350.ll
@@ -0,0 +1,21 @@
+; RUN: llc -disable-constant-hoisting < %s | FileCheck %s
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+@d = global i32 8, align 4
+
+define i32 @main() {
+entry:
+ %load = load i32, i32* @d, align 4
+ %conv1 = zext i32 %load to i64
+ %shl = shl i64 %conv1, 1
+ %mul = and i64 %shl, 4294967312
+ %cmp = icmp ugt i64 4294967295, %mul
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+; CHECK: main:
+; CHECK: movl d, %[[load:.*]]
+; CHECK: movl %[[load]], %[[copy:.*]]
+; CHECK: shrl $31, %[[copy]]
+; CHECK: addl %[[load]], %[[load]]
diff --git a/test/CodeGen/X86/pr2659.ll b/test/CodeGen/X86/pr2659.ll
index 8003588a2e84..debb13ee3e5d 100644
--- a/test/CodeGen/X86/pr2659.ll
+++ b/test/CodeGen/X86/pr2659.ll
@@ -21,7 +21,7 @@ forcond.preheader: ; preds = %entry
; CHECK: je
; There should be no moves required in the for loop body.
-; CHECK: %forbody
+; CHECK: %forbody{{$}}
; CHECK-NOT: mov
; CHECK: jbe
diff --git a/test/CodeGen/X86/pr26652.ll b/test/CodeGen/X86/pr26652.ll
new file mode 100644
index 000000000000..c47128a51e9a
--- /dev/null
+++ b/test/CodeGen/X86/pr26652.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=x86
+; PR26652
+
+define <2 x i32> @test(<4 x i32> %a, <4 x i32> %b) {
+entry:
+ %0 = or <4 x i32> %a, %b
+ %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ ret <2 x i32> %1
+}
diff --git a/test/CodeGen/X86/pr26757.ll b/test/CodeGen/X86/pr26757.ll
new file mode 100644
index 000000000000..96cbb783ca01
--- /dev/null
+++ b/test/CodeGen/X86/pr26757.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i386-pc-windows-msvc"
+
+declare void @throw()
+
+define void @test1() personality i32 (...)* @__CxxFrameHandler3 {
+ %e = alloca i8, align 4
+ invoke void @throw()
+ to label %.noexc unwind label %catch.dispatch
+
+.noexc:
+ unreachable
+
+catch.object.Exception:
+ %cp = catchpad within %cs [i8* null, i32 0, i8* %e]
+ catchret from %cp to label %catchhandler
+
+catch.dispatch:
+ %cs = catchswitch within none [label %catch.object.Exception] unwind to caller
+
+catchhandler:
+ call void @use(i8* %e)
+ ret void
+}
+
+; CHECK-LABEL: $handlerMap$0$test1:
+; CHECK: .long 0
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long -20
+
+declare void @use(i8*)
+
+declare i32 @__CxxFrameHandler3(...)
diff --git a/test/CodeGen/X86/pr26835.ll b/test/CodeGen/X86/pr26835.ll
new file mode 100644
index 000000000000..4fc73b885757
--- /dev/null
+++ b/test/CodeGen/X86/pr26835.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux"
+
+; CHECK-LABEL: foo
+; CHECK: div
+define i24 @foo(i24 %a, i24 %b) {
+ %r = urem i24 %a, %b
+ ret i24 %r
+}
diff --git a/test/CodeGen/X86/pr26870.ll b/test/CodeGen/X86/pr26870.ll
new file mode 100644
index 000000000000..2731ed2d0125
--- /dev/null
+++ b/test/CodeGen/X86/pr26870.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -mtriple=i686-pc-windows-msvc18.0.0 -mcpu=pentium4
+
+define x86_thiscallcc i32* @fn4(i32* %this, i8* dereferenceable(1) %p1) {
+entry:
+ %DL = getelementptr inbounds i32, i32* %this, i32 0
+ %call.i = tail call x86_thiscallcc i64 @fn1(i32* %DL)
+ %getTypeAllocSize___trans_tmp_2.i = getelementptr inbounds i32, i32* %this, i32 0
+ %0 = load i32, i32* %getTypeAllocSize___trans_tmp_2.i, align 4
+ %call.i8 = tail call x86_thiscallcc i64 @fn1(i32* %DL)
+ %1 = insertelement <2 x i64> undef, i64 %call.i, i32 0
+ %2 = insertelement <2 x i64> %1, i64 %call.i8, i32 1
+ %3 = add nsw <2 x i64> %2, <i64 7, i64 7>
+ %4 = sdiv <2 x i64> %3, <i64 8, i64 8>
+ %5 = add nsw <2 x i64> %4, <i64 1, i64 1>
+ %6 = load i32, i32* %getTypeAllocSize___trans_tmp_2.i, align 4
+ %7 = insertelement <2 x i32> undef, i32 %0, i32 0
+ %8 = insertelement <2 x i32> %7, i32 %6, i32 1
+ %9 = zext <2 x i32> %8 to <2 x i64>
+ %10 = srem <2 x i64> %5, %9
+ %11 = sub <2 x i64> %5, %10
+ %12 = trunc <2 x i64> %11 to <2 x i32>
+ %13 = extractelement <2 x i32> %12, i32 0
+ %14 = extractelement <2 x i32> %12, i32 1
+ %cmp = icmp eq i32 %13, %14
+ br i1 %cmp, label %if.then, label %cleanup
+
+if.then:
+ %call4 = tail call x86_thiscallcc i32* @fn3(i8* nonnull %p1)
+ br label %cleanup
+
+cleanup:
+ %retval.0 = phi i32* [ %call4, %if.then ], [ undef, %entry ]
+ ret i32* %retval.0
+}
+
+declare x86_thiscallcc i32* @fn3(i8*)
+declare x86_thiscallcc i64 @fn1(i32*)
diff --git a/test/CodeGen/X86/pr27071.ll b/test/CodeGen/X86/pr27071.ll
new file mode 100644
index 000000000000..13608d510770
--- /dev/null
+++ b/test/CodeGen/X86/pr27071.ll
@@ -0,0 +1,29 @@
+; RUN: llc -relocation-model pic < %s | FileCheck %s
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-freebsd"
+
+@x1 = external thread_local global i32, align 4
+
+define void @x3() #0 {
+entry:
+ %0 = load i32, i32* @x1, align 4
+ %cond = icmp eq i32 %0, 92
+ br i1 %cond, label %sw.bb, label %sw.epilog
+
+sw.bb: ; preds = %entry
+ call void @x2(i8* null)
+ unreachable
+
+sw.epilog: ; preds = %entry
+ ret void
+}
+
+declare void @x2(i8*)
+
+attributes #0 = { optsize }
+
+; CHECK-LABEL: x3:
+; CHECK: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp4-.L0$pb), %[[REG:.*]]
+; CHECK-NEXT: leal x1@TLSGD(,%[[REG]]), %eax
+; CHECK-NEXT: calll ___tls_get_addr@PLT
+; CHECK-NEXT: cmpl $92, (%eax)
diff --git a/test/CodeGen/X86/pr27501.ll b/test/CodeGen/X86/pr27501.ll
new file mode 100644
index 000000000000..bde41214471d
--- /dev/null
+++ b/test/CodeGen/X86/pr27501.ll
@@ -0,0 +1,67 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+define void @test1(i64* %result.repack) personality i32 (...)* @__CxxFrameHandler3 {
+bb:
+ invoke void @may_throw(i32 1)
+ to label %postinvoke unwind label %cleanuppad
+; CHECK: movl $1, %ecx
+; CHECK: callq may_throw
+
+postinvoke: ; preds = %bb
+ store i64 19, i64* %result.repack, align 8
+
+; CHECK: movq $19, (%rsi)
+; CHECK: movl $2, %ecx
+; CHECK-NEXT: movq %rsi, -8(%rbp)
+; CHECK-NEXT: callq may_throw
+ invoke void @may_throw(i32 2)
+ to label %assertFailed unwind label %catch.dispatch
+
+catch.dispatch: ; preds = %cleanuppad9, %postinvoke
+ %tmp3 = catchswitch within none [label %catch.object.Throwable] unwind label %cleanuppad
+
+catch.object.Throwable: ; preds = %catch.dispatch
+ %tmp2 = catchpad within %tmp3 [i8* null, i32 64, i8* null]
+ catchret from %tmp2 to label %catchhandler
+
+catchhandler: ; preds = %catch.object.Throwable
+ invoke void @may_throw(i32 3)
+ to label %try.success.or.caught unwind label %cleanuppad
+
+try.success.or.caught: ; preds = %catchhandler
+ invoke void @may_throw(i32 4)
+ to label %postinvoke27 unwind label %cleanuppad24
+; CHECK: movl $4, %ecx
+; CHECK-NEXT: callq may_throw
+
+postinvoke27: ; preds = %try.success.or.caught
+ store i64 42, i64* %result.repack, align 8
+; CHECK: movq -8(%rbp), %[[reload:r..]]
+; CHECK-NEXT: movq $42, (%[[reload]])
+ ret void
+
+cleanuppad24: ; preds = %try.success.or.caught
+ %tmp5 = cleanuppad within none []
+ cleanupret from %tmp5 unwind to caller
+
+cleanuppad: ; preds = %catchhandler, %catch.dispatch, %bb
+ %tmp1 = cleanuppad within none []
+ cleanupret from %tmp1 unwind to caller
+
+assertFailed: ; preds = %postinvoke
+ invoke void @may_throw(i32 5)
+ to label %postinvoke13 unwind label %cleanuppad9
+
+postinvoke13: ; preds = %assertFailed
+ unreachable
+
+cleanuppad9: ; preds = %assertFailed
+ %tmp4 = cleanuppad within none []
+ cleanupret from %tmp4 unwind label %catch.dispatch
+}
+
+declare void @may_throw(i32)
+
+declare i32 @__CxxFrameHandler3(...)
diff --git a/test/CodeGen/X86/pr27591.ll b/test/CodeGen/X86/pr27591.ll
new file mode 100644
index 000000000000..11f5de4956a4
--- /dev/null
+++ b/test/CodeGen/X86/pr27591.ll
@@ -0,0 +1,51 @@
+; RUN: llc -o - -O0 < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test1(i32 %x) #0 {
+; CHECK-LABEL: test1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: movb %al, %cl
+; CHECK-NEXT: kmovw %ecx, %k0
+; CHECK-NEXT: kmovb %k0, %eax
+; CHECK-NEXT: andb $1, %al
+; CHECK-NEXT: movzbl %al, %edi
+; CHECK-NEXT: callq callee1
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+entry:
+ %tobool = icmp ne i32 %x, 0
+ call void @callee1(i1 zeroext %tobool)
+ ret void
+}
+
+define void @test2(i32 %x) #0 {
+; CHECK-LABEL: test2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: movb %al, %cl
+; CHECK-NEXT: kmovw %ecx, %k0
+; CHECK-NEXT: kmovw %k0, %ecx
+; CHECK-NEXT: movb %cl, %al
+; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: movl $-1, %edx
+; CHECK-NEXT: cmovnel %edx, %edi
+; CHECK-NEXT: callq callee2
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+entry:
+ %tobool = icmp ne i32 %x, 0
+ call void @callee2(i1 signext %tobool)
+ ret void
+}
+
+declare void @callee1(i1 zeroext)
+declare void @callee2(i1 signext)
+
+attributes #0 = { nounwind "target-cpu"="skylake-avx512" }
diff --git a/test/CodeGen/X86/pr27681.mir b/test/CodeGen/X86/pr27681.mir
new file mode 100644
index 000000000000..9473a21d7327
--- /dev/null
+++ b/test/CodeGen/X86/pr27681.mir
@@ -0,0 +1,87 @@
+# RUN: llc -mtriple=i386-unknown-linux-gnu -mcpu=slm -run-pass post-RA-sched -o - %s | FileCheck %s
+#
+# Verify that the critical antidependence breaker does not consider
+# a high byte register as available as a replacement register
+# in a certain context.
+--- |
+
+ define void @main() { ret void }
+
+...
+---
+# CHECK-LABEL: main
+name: main
+allVRegsAllocated: true
+tracksRegLiveness: true
+frameInfo:
+ stackSize: 52
+fixedStack:
+ - { id: 0, type: spill-slot, offset: -20, size: 4, alignment: 4, callee-saved-register: '%esi' }
+ - { id: 1, type: spill-slot, offset: -16, size: 4, alignment: 4, callee-saved-register: '%edi' }
+ - { id: 2, type: spill-slot, offset: -12, size: 4, alignment: 4, callee-saved-register: '%ebx' }
+ - { id: 3, type: spill-slot, offset: -8, size: 4, alignment: 4, callee-saved-register: '%ebp' }
+stack:
+ - { id: 0, type: spill-slot, offset: -53, size: 1, alignment: 1 }
+ - { id: 1, type: spill-slot, offset: -48, size: 4, alignment: 4 }
+ - { id: 2, type: spill-slot, offset: -32, size: 4, alignment: 4 }
+body: |
+ bb.0:
+ successors: %bb.1
+ liveins: %ebp, %ebx, %edi, %esi
+
+ frame-setup PUSH32r killed %ebp, implicit-def %esp, implicit %esp
+ frame-setup PUSH32r killed %ebx, implicit-def %esp, implicit %esp
+ frame-setup PUSH32r killed %edi, implicit-def %esp, implicit %esp
+ frame-setup PUSH32r killed %esi, implicit-def %esp, implicit %esp
+ %esp = frame-setup SUB32ri8 %esp, 36, implicit-def dead %eflags
+ %eax = MOV32ri 1
+ %ebp = MOV32ri 2
+ %ebx = MOV32ri 3
+ %ecx = MOV32ri 4
+ %edi = MOV32ri 5
+ %edx = MOV32ri 6
+
+ bb.1:
+ successors: %bb.3, %bb.2
+ liveins: %eax, %ebp, %ebx, %ecx, %edi, %edx
+
+ %ebp = SHR32rCL killed %ebp, implicit-def dead %eflags, implicit %cl
+ %ebp = XOR32rr killed %ebp, killed %ebx, implicit-def dead %eflags
+ TEST32rr %edx, %edx, implicit-def %eflags
+ %cl = SETNEr implicit %eflags
+ ; This %bl def is antidependent on the above use of %ebx
+ %bl = MOV8rm %esp, 1, _, 3, _ ; :: (load 1 from %stack.0)
+ %cl = OR8rr killed %cl, %bl, implicit-def dead %eflags
+ %esi = MOVZX32rr8 killed %cl
+ %esi = ADD32rr killed %esi, killed %edi, implicit-def dead %eflags
+ %ecx = MOV32rm %esp, 1, _, 24, _ ; :: (load 4 from %stack.2)
+ %edx = SAR32rCL killed %edx, implicit-def dead %eflags, implicit %cl
+ TEST32rr killed %edx, %edx, implicit-def %eflags
+ %cl = SETNEr implicit %eflags
+ ; Verify that removal of the %bl antidependence does not use %ch
+ ; as a replacement register.
+ ; CHECK: %cl = AND8rr %cl, killed %b
+ %cl = AND8rr killed %cl, killed %bl, implicit-def dead %eflags
+ CMP32ri8 %ebp, -1, implicit-def %eflags
+ %edx = MOV32ri 0
+ JE_1 %bb.3, implicit %eflags
+
+ bb.2:
+ successors: %bb.3
+ liveins: %cl, %eax, %ebp, %esi
+
+ OR32mr %esp, 1, _, 8, _, killed %eax, implicit-def %eflags ; :: (store 4 into %stack.1)
+ %dl = SETNEr implicit %eflags, implicit-def %edx
+
+ bb.3:
+ liveins: %cl, %ebp, %edx, %esi
+
+ %eax = XOR32rr undef %eax, undef %eax, implicit-def dead %eflags
+ %esp = ADD32ri8 %esp, 36, implicit-def dead %eflags
+ %esi = POP32r implicit-def %esp, implicit %esp
+ %edi = POP32r implicit-def %esp, implicit %esp
+ %ebx = POP32r implicit-def %esp, implicit %esp
+ %ebp = POP32r implicit-def %esp, implicit %esp
+ RET 0, %eax
+
+...
diff --git a/test/CodeGen/X86/pr28173.ll b/test/CodeGen/X86/pr28173.ll
new file mode 100644
index 000000000000..31ea4ffb5616
--- /dev/null
+++ b/test/CodeGen/X86/pr28173.ll
@@ -0,0 +1,41 @@
+; RUN: llc -mattr=+avx512f < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Note that the kmovs should really *not* appear in the output; they are an
+; artifact of the current poor lowering. This is tracked by PR28175.
+
+define i64 @foo64(i1 zeroext %i, i32 %j) #0 {
+; CHECK-LABEL: foo64:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill
+; CHECK-NEXT: orq $-2, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+ br label %bb
+
+bb:
+ %z = zext i1 %i to i64
+ %v = or i64 %z, -2
+ br label %end
+
+end:
+ ret i64 %v
+}
+
+define i16 @foo16(i1 zeroext %i, i32 %j) #0 {
+; CHECK-LABEL: foo16:
+; CHECK: # BB#0:
+; CHECK-NEXT: orl $65534, %edi # imm = 0xFFFE
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ br label %bb
+
+bb:
+ %z = zext i1 %i to i16
+ %v = or i16 %z, -2
+ br label %end
+
+end:
+ ret i16 %v
+}
diff --git a/test/CodeGen/X86/pr28444.ll b/test/CodeGen/X86/pr28444.ll
new file mode 100644
index 000000000000..452f01c166b7
--- /dev/null
+++ b/test/CodeGen/X86/pr28444.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 < %s | FileCheck %s
+; https://llvm.org/bugs/show_bug.cgi?id=28444
+
+; extract_vector_elt is allowed to have a different result type than
+; the vector scalar type.
+; This uses both
+; i8 = extract_vector_elt v1i1, Constant:i64<0>
+; i1 = extract_vector_elt v1i1, Constant:i64<0>
+
+
+; CHECK-LABEL: {{^}}extractelt_mismatch_vector_element_type:
+; CHECK: movb $1, %al
+; CHECK: movb %al
+; CHECK: movb %al
+define void @extractelt_mismatch_vector_element_type(i32 %arg) {
+bb:
+ %tmp = icmp ult i32 %arg, 0
+ %tmp2 = insertelement <1 x i1> undef, i1 true, i32 0
+ %tmp3 = select i1 %tmp, <1 x i1> undef, <1 x i1> %tmp2
+ %tmp6 = extractelement <1 x i1> %tmp3, i32 0
+ br label %bb1
+
+bb1:
+ store volatile <1 x i1> %tmp3, <1 x i1>* undef
+ store volatile i1 %tmp6, i1* undef
+ ret void
+}
diff --git a/test/CodeGen/X86/pr28472.ll b/test/CodeGen/X86/pr28472.ll
new file mode 100644
index 000000000000..9d2609022b3d
--- /dev/null
+++ b/test/CodeGen/X86/pr28472.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
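+; Inserting %val at a dynamic index and extracting at the same index just
+; returns %val, so the whole function should fold to a plain return.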
+; CHECK-LABEL: {{^}}same_dynamic_index_fp_vector_type:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+define float @same_dynamic_index_fp_vector_type(float %val, i32 %idx) {
+bb:
+ %tmp0 = insertelement <4 x float> undef, float %val, i32 %idx
+ %tmp1 = extractelement <4 x float> %tmp0, i32 %idx
+ ret float %tmp1
+}
diff --git a/test/CodeGen/X86/pr28489.ll b/test/CodeGen/X86/pr28489.ll
new file mode 100644
index 000000000000..898b0870b65d
--- /dev/null
+++ b/test/CodeGen/X86/pr28489.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=i686-pc-linux -O0 | FileCheck %s
+declare void @g(i32, i1)
+
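+; The i1 success flag of the cmpxchg is materialized with sete and then
+; zero-extended for the i32 call argument.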
+;CHECK-LABEL: f:
+;CHECK: cmpxchg8b
+;CHECK: sete %cl
+;CHECK: movzbl %cl
+define void @f(i64* %arg, i64 %arg1) {
+entry:
+ %tmp5 = cmpxchg i64* %arg, i64 %arg1, i64 %arg1 seq_cst seq_cst
+ %tmp7 = extractvalue { i64, i1 } %tmp5, 1
+ %tmp9 = zext i1 %tmp7 to i32
+ call void @g(i32 %tmp9, i1 %tmp7)
+ ret void
+}
diff --git a/test/CodeGen/X86/pr28515.ll b/test/CodeGen/X86/pr28515.ll
new file mode 100644
index 000000000000..1fad26506668
--- /dev/null
+++ b/test/CodeGen/X86/pr28515.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s
+
+@0 = private constant [8 x i32] zeroinitializer
+
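+; The result of the masked load is unused, so only the scalar store should
+; survive.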
+; CHECK-LABEL: foo:
+; CHECK: movl %esi, (%rdi)
+; CHECK-NEXT: retq
+define void @foo(i32* %p, i32 %v, <8 x i1> %mask) {
+ store i32 %v, i32* %p
+ %wide.masked.load = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* bitcast (i32* getelementptr ([8 x i32], [8 x i32]* @0, i64 0, i64 0) to <8 x i32>*), i32 4, <8 x i1> %mask, <8 x i32> undef)
+ ret void
+}
+
+declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>) #0
+
+attributes #0 = { argmemonly nounwind readonly }
diff --git a/test/CodeGen/X86/pr28560.ll b/test/CodeGen/X86/pr28560.ll
new file mode 100644
index 000000000000..d0061f670cf1
--- /dev/null
+++ b/test/CodeGen/X86/pr28560.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=i686-pc-linux -print-after=postrapseudos < %s 2>&1 | FileCheck %s
+
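+; The byte-register copy below must keep the implicit-use/kill and
+; implicit-def operands of the containing 32-bit register.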
+; CHECK: MOV8rr %{{[A-D]}}L, %E[[R:[A-D]]]X<imp-use,kill>, %E[[R]]X<imp-def>
+define i32 @foo(i32 %i, i32 %k, i8* %p) {
+ %f = icmp ne i32 %i, %k
+ %s = zext i1 %f to i8
+ %ret = zext i1 %f to i32
+ br label %next
+next:
+ %d = add i8 %s, 5
+ store i8 %d, i8* %p
+ ret i32 %ret
+}
diff --git a/test/CodeGen/X86/pr5145.ll b/test/CodeGen/X86/pr5145.ll
index 4dee5f8d7d2a..259d55b030e5 100644
--- a/test/CodeGen/X86/pr5145.ll
+++ b/test/CodeGen/X86/pr5145.ll
@@ -5,26 +5,26 @@ define void @atomic_maxmin_i8() {
; CHECK: atomic_maxmin_i8
%1 = atomicrmw max i8* @sc8, i8 5 acquire
; CHECK: [[LABEL1:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: movsbl
-; CHECK: cmpl
+; CHECK: cmpb
+; CHECK: jg
; CHECK: lock cmpxchgb
; CHECK: jne [[LABEL1]]
%2 = atomicrmw min i8* @sc8, i8 6 acquire
; CHECK: [[LABEL3:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: movsbl
-; CHECK: cmpl
+; CHECK: cmpb
+; CHECK: jl
; CHECK: lock cmpxchgb
; CHECK: jne [[LABEL3]]
%3 = atomicrmw umax i8* @sc8, i8 7 acquire
; CHECK: [[LABEL5:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: movzbl
-; CHECK: cmpl
+; CHECK: cmpb
+; CHECK: ja
; CHECK: lock cmpxchgb
; CHECK: jne [[LABEL5]]
%4 = atomicrmw umin i8* @sc8, i8 8 acquire
; CHECK: [[LABEL7:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: movzbl
-; CHECK: cmpl
+; CHECK: cmpb
+; CHECK: jb
; CHECK: lock cmpxchgb
; CHECK: jne [[LABEL7]]
ret void
diff --git a/test/CodeGen/X86/promote-i16.ll b/test/CodeGen/X86/promote-i16.ll
index 963bc1c2927a..7eb367480d76 100644
--- a/test/CodeGen/X86/promote-i16.ll
+++ b/test/CodeGen/X86/promote-i16.ll
@@ -1,11 +1,12 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s
define signext i16 @foo(i16 signext %x) nounwind {
entry:
; CHECK-LABEL: foo:
-; CHECK-NOT: movzwl
-; CHECK: movswl 4(%esp), %eax
-; CHECK: xorl $21998, %eax
+; CHECK: movzwl 4(%esp), %eax
+; CHECK-NEXT: xorl $21998, %eax
+; CHECK-NEXT: # kill
+; CHECK-NEXT: retl
%0 = xor i16 %x, 21998
ret i16 %0
}
@@ -13,9 +14,10 @@ entry:
define signext i16 @bar(i16 signext %x) nounwind {
entry:
; CHECK-LABEL: bar:
-; CHECK-NOT: movzwl
-; CHECK: movswl 4(%esp), %eax
-; CHECK: xorl $-10770, %eax
+; CHECK: movzwl 4(%esp), %eax
+; CHECK-NEXT: xorl $54766, %eax
+; CHECK-NEXT: # kill
+; CHECK-NEXT: retl
%0 = xor i16 %x, 54766
ret i16 %0
}
diff --git a/test/CodeGen/X86/ps4-noreturn.ll b/test/CodeGen/X86/ps4-noreturn.ll
new file mode 100644
index 000000000000..4c14f2189325
--- /dev/null
+++ b/test/CodeGen/X86/ps4-noreturn.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-scei-ps4 | FileCheck %s
+
+declare i32 @personality(...)
+
+; Check that after the (implicitly noreturn) unwind call, there is
+; another instruction. It was easy to produce 'ud2', so we check for that.
+define void @foo1() personality i32 (...)* @personality {
+; CHECK-LABEL: foo1:
+; CHECK: .cfi_startproc
+; CHECK: callq bar
+; CHECK: retq
+; Check for 'ud2' between noreturn call and function end.
+; CHECK: callq _Unwind_Resume
+; CHECK-NEXT: ud2
+; CHECK-NEXT: .Lfunc_end0:
+ invoke void @bar()
+ to label %normal
+ unwind label %catch
+normal:
+ ret void
+catch:
+ %1 = landingpad { i8*, i32 } cleanup
+ resume { i8*, i32 } %1
+}
+
+declare void @bar() #0
+
+; Similar check after an explicit noreturn call.
+define void @foo2() {
+; CHECK-LABEL: foo2:
+; CHECK: callq bar
+; CHECK-NEXT: ud2
+; CHECK-NEXT: .Lfunc_end1:
+ tail call void @bar()
+ unreachable
+}
+
+attributes #0 = { noreturn }
diff --git a/test/CodeGen/X86/pshufb-mask-comments.ll b/test/CodeGen/X86/pshufb-mask-comments.ll
index 105a035be592..8364915fa0d0 100644
--- a/test/CodeGen/X86/pshufb-mask-comments.ll
+++ b/test/CodeGen/X86/pshufb-mask-comments.ll
@@ -1,10 +1,13 @@
-; RUN: llc < %s -march=x86-64 -mattr=+ssse3 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s
; Test that the pshufb mask comment is correct.
define <16 x i8> @test1(<16 x i8> %V) {
; CHECK-LABEL: test1:
-; CHECK: pshufb {{.*}}# xmm0 = xmm0[1,0,0,0,0,2,0,0,0,0,3,0,0,0,0,4]
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,0,0,0,2,0,0,0,0,3,0,0,0,0,4]
+; CHECK-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 1, i8 0, i8 0, i8 0, i8 0, i8 2, i8 0, i8 0, i8 0, i8 0, i8 3, i8 0, i8 0, i8 0, i8 0, i8 4>)
ret <16 x i8> %1
}
@@ -13,7 +16,9 @@ define <16 x i8> @test1(<16 x i8> %V) {
define <16 x i8> @test2(<16 x i8> %V) {
; CHECK-LABEL: test2:
-; CHECK: pshufb {{.*}}# xmm0 = xmm0[15,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2]
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2]
+; CHECK-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 15, i8 0, i8 0, i8 0, i8 0, i8 16, i8 0, i8 0, i8 0, i8 0, i8 17, i8 0, i8 0, i8 0, i8 0, i8 50>)
ret <16 x i8> %1
}
@@ -22,31 +27,64 @@ define <16 x i8> @test2(<16 x i8> %V) {
define <16 x i8> @test3(<16 x i8> %V) {
; CHECK-LABEL: test3:
-; CHECK: pshufb {{.*}}# xmm0 = xmm0[1,0,0,15,0,2,0,0],zero,xmm0[0,3,0,0],zero,xmm0[0,4]
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,0,15,0,2,0,0],zero,xmm0[0,3,0,0],zero,xmm0[0,4]
+; CHECK-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 1, i8 0, i8 0, i8 127, i8 0, i8 2, i8 0, i8 0, i8 128, i8 0, i8 3, i8 0, i8 0, i8 255, i8 0, i8 4>)
ret <16 x i8> %1
}
; Test that we won't crash when the constant is reused for another instruction.
-define <16 x i8> @test4(<2 x i64>* %V) {
-; CHECK-LABEL: test4
-; CHECK: pshufb {{.*}}
- store <2 x i64> <i64 1084818905618843912, i64 506097522914230528>, <2 x i64>* %V, align 16
- %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> undef, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
- ret <16 x i8> %1
+define <16 x i8> @test4(<16 x i8> %V, <2 x i64>* %P) {
+; CHECK-LABEL: test4:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1084818905618843912,506097522914230528]
+; CHECK-NEXT: movaps %xmm1, (%rdi)
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: retq
+ %1 = insertelement <2 x i64> undef, i64 1084818905618843912, i32 0
+ %2 = insertelement <2 x i64> %1, i64 506097522914230528, i32 1
+ store <2 x i64> %2, <2 x i64>* %P, align 16
+ %3 = bitcast <2 x i64> %2 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> %3)
+ ret <16 x i8> %4
}
-define <16 x i8> @test5() {
-; CHECK-LABEL: test5
-; CHECK: pshufb {{.*}}
+define <16 x i8> @test5(<16 x i8> %V) {
+; CHECK-LABEL: test5:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: movd %rax, %xmm1
+; CHECK-NEXT: movaps %xmm1, (%rax)
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,1]
+; CHECK-NEXT: movdqa %xmm1, (%rax)
+; CHECK-NEXT: pshufb %xmm1, %xmm0
+; CHECK-NEXT: retq
store <2 x i64> <i64 1, i64 0>, <2 x i64>* undef, align 16
%l = load <2 x i64>, <2 x i64>* undef, align 16
%shuffle = shufflevector <2 x i64> %l, <2 x i64> undef, <2 x i32> zeroinitializer
store <2 x i64> %shuffle, <2 x i64>* undef, align 16
%1 = load <16 x i8>, <16 x i8>* undef, align 16
- %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> undef, <16 x i8> %1)
+ %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> %1)
ret <16 x i8> %2
}
+; Test for a reused constant that would allow the pshufb to combine to a simpler instruction.
+
+define <16 x i8> @test6(<16 x i8> %V, <2 x i64>* %P) {
+; CHECK-LABEL: test6:
+; CHECK: # BB#0:
+; CHECK-NEXT: movaps {{.*#+}} xmm1 = [217019414673948672,506380106026255364]
+; CHECK-NEXT: movaps %xmm1, (%rdi)
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: retq
+ %1 = insertelement <2 x i64> undef, i64 217019414673948672, i32 0
+ %2 = insertelement <2 x i64> %1, i64 506380106026255364, i32 1
+ store <2 x i64> %2, <2 x i64>* %P, align 16
+ %3 = bitcast <2 x i64> %2 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> %3)
+ ret <16 x i8> %4
+}
+
declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone
diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll
index c6d118d6da69..a63d1c60e379 100644
--- a/test/CodeGen/X86/psubus.ll
+++ b/test/CodeGen/X86/psubus.ll
@@ -54,30 +54,21 @@ vector.ph:
}
define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
-; SSE2-LABEL: test3:
-; SSE2: ## BB#0: ## %vector.ph
-; SSE2-NEXT: movd %esi, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; SSE2-NEXT: movdqu (%rdi), %xmm1
-; SSE2-NEXT: psubusw %xmm0, %xmm1
-; SSE2-NEXT: movdqu %xmm1, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: test3:
-; SSSE3: ## BB#0: ## %vector.ph
-; SSSE3-NEXT: movd %esi, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; SSSE3-NEXT: movdqu (%rdi), %xmm1
-; SSSE3-NEXT: psubusw %xmm0, %xmm1
-; SSSE3-NEXT: movdqu %xmm1, (%rdi)
-; SSSE3-NEXT: retq
+; SSE-LABEL: test3:
+; SSE: ## BB#0: ## %vector.ph
+; SSE-NEXT: movd %esi, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: movdqu (%rdi), %xmm1
+; SSE-NEXT: psubusw %xmm0, %xmm1
+; SSE-NEXT: movdqu %xmm1, (%rdi)
+; SSE-NEXT: retq
;
; AVX1-LABEL: test3:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovd %esi, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vmovdqu (%rdi), %xmm1
; AVX1-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
@@ -159,9 +150,8 @@ define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
; SSE2: ## BB#0: ## %vector.ph
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: movdqu (%rdi), %xmm1
; SSE2-NEXT: psubusb %xmm0, %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rdi)
@@ -304,46 +294,34 @@ vector.ph:
}
define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
-; SSE2-LABEL: test9:
-; SSE2: ## BB#0: ## %vector.ph
-; SSE2-NEXT: movd %esi, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; SSE2-NEXT: movdqu (%rdi), %xmm1
-; SSE2-NEXT: movdqu 16(%rdi), %xmm2
-; SSE2-NEXT: psubusw %xmm0, %xmm1
-; SSE2-NEXT: psubusw %xmm0, %xmm2
-; SSE2-NEXT: movdqu %xmm2, 16(%rdi)
-; SSE2-NEXT: movdqu %xmm1, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: test9:
-; SSSE3: ## BB#0: ## %vector.ph
-; SSSE3-NEXT: movd %esi, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; SSSE3-NEXT: movdqu (%rdi), %xmm1
-; SSSE3-NEXT: movdqu 16(%rdi), %xmm2
-; SSSE3-NEXT: psubusw %xmm0, %xmm1
-; SSSE3-NEXT: psubusw %xmm0, %xmm2
-; SSSE3-NEXT: movdqu %xmm2, 16(%rdi)
-; SSSE3-NEXT: movdqu %xmm1, (%rdi)
-; SSSE3-NEXT: retq
+; SSE-LABEL: test9:
+; SSE: ## BB#0: ## %vector.ph
+; SSE-NEXT: movd %esi, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: movdqu (%rdi), %xmm1
+; SSE-NEXT: movdqu 16(%rdi), %xmm2
+; SSE-NEXT: psubusw %xmm0, %xmm1
+; SSE-NEXT: psubusw %xmm0, %xmm2
+; SSE-NEXT: movdqu %xmm2, 16(%rdi)
+; SSE-NEXT: movdqu %xmm1, (%rdi)
+; SSE-NEXT: retq
;
; AVX1-LABEL: test9:
; AVX1: ## BB#0: ## %vector.ph
; AVX1-NEXT: vmovups (%rdi), %ymm0
-; AVX1-NEXT: vmovd %esi, %xmm1
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovd %esi, %xmm2
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; AVX1-NEXT: vpsubw %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT: vpmaxuw %xmm1, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpeqw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm4
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
@@ -471,9 +449,8 @@ define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
; SSE2: ## BB#0: ## %vector.ph
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: movdqu (%rdi), %xmm1
; SSE2-NEXT: movdqu 16(%rdi), %xmm2
; SSE2-NEXT: psubusb %xmm0, %xmm1
diff --git a/test/CodeGen/X86/push-cfi-debug.ll b/test/CodeGen/X86/push-cfi-debug.ll
index cc00fab525ab..7f438e306e4d 100644
--- a/test/CodeGen/X86/push-cfi-debug.ll
+++ b/test/CodeGen/X86/push-cfi-debug.ll
@@ -23,7 +23,7 @@ declare x86_stdcallcc void @stdfoo(i32, i32) #0
; CHECK: .cfi_adjust_cfa_offset 4
; CHECK: calll stdfoo
; CHECK: .cfi_adjust_cfa_offset -8
-; CHECK: addl $8, %esp
+; CHECK: addl $20, %esp
; CHECK: .cfi_adjust_cfa_offset -8
define void @test1() #0 !dbg !4 {
entry:
@@ -38,11 +38,10 @@ attributes #0 = { nounwind optsize }
!llvm.module.flags = !{!7, !8}
!llvm.ident = !{!9}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (trunk 250289)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (trunk 250289)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "foo.c", directory: "foo")
!2 = !{}
-!3 = !{!4}
-!4 = distinct !DISubprogram(name: "test1", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: true, variables: !2)
+!4 = distinct !DISubprogram(name: "test1", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: true, unit: !0, variables: !2)
!5 = !DISubroutineType(types: !6)
!6 = !{null}
!7 = !{i32 2, !"Dwarf Version", i32 4}
diff --git a/test/CodeGen/X86/push-cfi.ll b/test/CodeGen/X86/push-cfi.ll
index 6389708f42cc..f0772fc28c63 100644
--- a/test/CodeGen/X86/push-cfi.ll
+++ b/test/CodeGen/X86/push-cfi.ll
@@ -82,7 +82,7 @@ cleanup:
; LINUX-NEXT: Ltmp{{[0-9]+}}:
; LINUX-NEXT: .cfi_adjust_cfa_offset 4
; LINUX-NEXT: call
-; LINUX-NEXT: addl $16, %esp
+; LINUX-NEXT: addl $28, %esp
; LINUX: .cfi_adjust_cfa_offset -16
; DARWIN-NOT: .cfi_escape
; DARWIN-NOT: pushl
diff --git a/test/CodeGen/X86/ragreedy-hoist-spill.ll b/test/CodeGen/X86/ragreedy-hoist-spill.ll
index 46b65bd24fc0..1d6b4f94731b 100644
--- a/test/CodeGen/X86/ragreedy-hoist-spill.ll
+++ b/test/CodeGen/X86/ragreedy-hoist-spill.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mtriple=x86_64-apple-macosx -regalloc=greedy | FileCheck %s
; This testing case is reduced from 254.gap SyFgets function.
-; We make sure a spill is not hoisted to a hotter outer loop.
+; We make sure a spill is hoisted to a cold BB inside the hotter outer loop.
%struct.TMP.1 = type { %struct.TMP.2*, %struct.TMP.2*, [1024 x i8] }
%struct.TMP.2 = type { i8*, i32, i32, i16, i16, %struct.TMP.3, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.TMP.3, %struct.TMP.4*, i32, [3 x i8], [1 x i8], %struct.TMP.3, i32, i64 }
@@ -181,6 +181,10 @@ sw.bb474:
br i1 %cmp476, label %if.end517, label %do.body479.preheader
do.body479.preheader:
+ ; CHECK: do.body479.preheader
+ ; The spill is hoisted here. Although the loop at depth 1 is even hotter than the loop at depth 2, do.body479.preheader is cold.
+ ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
+ ; CHECK: land.rhs485
%cmp4833314 = icmp eq i8 undef, 0
br i1 %cmp4833314, label %if.end517, label %land.rhs485
@@ -200,8 +204,8 @@ land.lhs.true490:
lor.rhs500:
; CHECK: lor.rhs500
- ; Make sure that we don't hoist the spill to outer loops.
- ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
+ ; Make sure the spill is hoisted to a cold preheader in the outer loop.
+ ; CHECK-NOT: movq %r{{.*}}, {{[0-9]+}}(%rsp)
; CHECK: callq {{.*}}maskrune
%call3.i.i2792 = call i32 @__maskrune(i32 undef, i64 256)
br i1 undef, label %land.lhs.true504, label %do.body479.backedge
diff --git a/test/CodeGen/X86/reduce-trunc-shl.ll b/test/CodeGen/X86/reduce-trunc-shl.ll
new file mode 100644
index 000000000000..74612df4dd36
--- /dev/null
+++ b/test/CodeGen/X86/reduce-trunc-shl.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+
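+; The shift of the <4 x i64> value feeding the truncate should be narrowed to
+; a 32-bit shift (pslld/vpslld) after the lanes are repacked.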
+define void @trunc_shl_7_v4i32_v4i64(<4 x i32> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+; SSE2-LABEL: trunc_shl_7_v4i32_v4i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = mem[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: pslld $7, %xmm1
+; SSE2-NEXT: movdqa %xmm1, (%rdi)
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: trunc_shl_7_v4i32_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpslld $7, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %val = load <4 x i64>, <4 x i64> addrspace(1)* %in
+ %shl = shl <4 x i64> %val, <i64 7, i64 7, i64 7, i64 7>
+ %trunc = trunc <4 x i64> %shl to <4 x i32>
+ store <4 x i32> %trunc, <4 x i32> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll b/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
index 016b0d13fc4a..ba8ff1bc1819 100644
--- a/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
+++ b/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
@@ -37,7 +37,7 @@ declare noalias i32* @make_data()
; We used to produce a useless copy here and move %data into another temporary register.
; CHECK-NOT: movq [[ARG1]]
; End of the first basic block.
-; CHECK: .align
+; CHECK: .p2align
; Now check that %data is used in an address computation.
; CHECK: leaq ([[ARG1]]
define %struct._list* @make_list(i32* nocapture readonly %data, i32* nocapture %value, i32* nocapture %all) {
diff --git a/test/CodeGen/X86/rem.ll b/test/CodeGen/X86/rem.ll
index 733b7942a6d5..cc591e5ac00b 100644
--- a/test/CodeGen/X86/rem.ll
+++ b/test/CodeGen/X86/rem.ll
@@ -1,37 +1,84 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-unknown | FileCheck %s
-; CHECK-LABEL: test1:
-; CHECK-NOT: div
define i32 @test1(i32 %X) {
- %tmp1 = srem i32 %X, 255 ; <i32> [#uses=1]
- ret i32 %tmp1
+; CHECK-LABEL: test1:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl $-2139062143, %edx # imm = 0x80808081
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: imull %edx
+; CHECK-NEXT: addl %ecx, %edx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shrl $31, %eax
+; CHECK-NEXT: sarl $7, %edx
+; CHECK-NEXT: addl %eax, %edx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shll $8, %eax
+; CHECK-NEXT: subl %edx, %eax
+; CHECK-NEXT: subl %eax, %ecx
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retl
+;
+ %tmp1 = srem i32 %X, 255
+ ret i32 %tmp1
}
-; CHECK-LABEL: test2:
-; CHECK-NOT: div
define i32 @test2(i32 %X) {
- %tmp1 = srem i32 %X, 256 ; <i32> [#uses=1]
- ret i32 %tmp1
+; CHECK-LABEL: test2:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: sarl $31, %ecx
+; CHECK-NEXT: shrl $24, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: andl $-256, %ecx
+; CHECK-NEXT: subl %ecx, %eax
+; CHECK-NEXT: retl
+;
+ %tmp1 = srem i32 %X, 256
+ ret i32 %tmp1
}
-; CHECK-LABEL: test3:
-; CHECK-NOT: div
define i32 @test3(i32 %X) {
- %tmp1 = urem i32 %X, 255 ; <i32> [#uses=1]
- ret i32 %tmp1
+; CHECK-LABEL: test3:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl $-2139062143, %edx # imm = 0x80808081
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: mull %edx
+; CHECK-NEXT: shrl $7, %edx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: shll $8, %eax
+; CHECK-NEXT: subl %edx, %eax
+; CHECK-NEXT: subl %eax, %ecx
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retl
+;
+ %tmp1 = urem i32 %X, 255
+ ret i32 %tmp1
}
-; CHECK-LABEL: test4:
-; CHECK-NOT: div
define i32 @test4(i32 %X) {
- %tmp1 = urem i32 %X, 256 ; <i32> [#uses=1]
- ret i32 %tmp1
+; CHECK-LABEL: test4:
+; CHECK: # BB#0:
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: retl
+;
+ %tmp1 = urem i32 %X, 256
+ ret i32 %tmp1
}
-; CHECK-LABEL: test5:
-; CHECK-NOT: cltd
define i32 @test5(i32 %X) nounwind readnone {
+; CHECK-LABEL: test5:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl $41, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: idivl {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retl
+;
entry:
- %0 = srem i32 41, %X
- ret i32 %0
+ %0 = srem i32 41, %X
+ ret i32 %0
}
diff --git a/test/CodeGen/X86/rem_crash.ll b/test/CodeGen/X86/rem_crash.ll
index 8363b22ab65f..a5529a769a0b 100644
--- a/test/CodeGen/X86/rem_crash.ll
+++ b/test/CodeGen/X86/rem_crash.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s
+; RUN: llc < %s -march=x86
+; RUN: llc < %s -march=x86-64
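+; Compile-only test: no FileCheck, just verify that llc does not crash for
+; either 32-bit or 64-bit x86.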
define i8 @test_minsize_uu8(i8 %x) minsize optsize {
entry:
diff --git a/test/CodeGen/X86/return-ext.ll b/test/CodeGen/X86/return-ext.ll
new file mode 100644
index 000000000000..ef160f43b4aa
--- /dev/null
+++ b/test/CodeGen/X86/return-ext.ll
@@ -0,0 +1,138 @@
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -fixup-byte-word-insts=0 | \
+; RUN: FileCheck -check-prefix=CHECK -check-prefix=BWOFF %s
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -fixup-byte-word-insts=1 | \
+; RUN: FileCheck -check-prefix=CHECK -check-prefix=BWON %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -fixup-byte-word-insts=0 | \
+; RUN: FileCheck -check-prefix=CHECK -check-prefix=BWOFF %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -fixup-byte-word-insts=1 | \
+; RUN: FileCheck -check-prefix=CHECK -check-prefix=BWON %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -fixup-byte-word-insts=0 | \
+; RUN: FileCheck -check-prefix=DARWIN -check-prefix=DARWIN-BWOFF %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -fixup-byte-word-insts=1 | \
+; RUN: FileCheck -check-prefix=DARWIN -check-prefix=DARWIN-BWON %s
+
+
+@x = common global i32 0, align 4
+
+define zeroext i1 @unsigned_i1() {
+entry:
+ %0 = load i32, i32* @x
+ %cmp = icmp eq i32 %0, 42
+ ret i1 %cmp
+
+; Unsigned i1 return values are not extended.
+; CHECK-LABEL: unsigned_i1:
+; CHECK: cmp
+; CHECK-NEXT: sete
+; CHECK-NEXT: ret
+}
+
+define zeroext i8 @unsigned_i8() {
+entry:
+ %0 = load i32, i32* @x
+ %cmp = icmp eq i32 %0, 42
+ %retval = zext i1 %cmp to i8
+ ret i8 %retval
+
+; Unsigned i8 return values are not extended.
+; CHECK-LABEL: unsigned_i8:
+; CHECK: cmp
+; CHECK-NEXT: sete
+; CHECK-NEXT: ret
+
+; Except on Darwin, for legacy reasons.
+; DARWIN-LABEL: unsigned_i8:
+; DARWIN: xorl
+; DARWIN-NEXT: cmp
+; DARWIN-NEXT: sete
+; DARWIN-NEXT: ret
+}
+
+define signext i8 @signed_i8() {
+entry:
+ %0 = load i32, i32* @x
+ %cmp = icmp eq i32 %0, 42
+ %retval = zext i1 %cmp to i8
+ ret i8 %retval
+
+; Signed i8 return values are not extended.
+; CHECK-LABEL: signed_i8:
+; CHECK: cmp
+; CHECK-NEXT: sete
+; CHECK-NEXT: ret
+
+; Except on Darwin, for legacy reasons.
+; DARWIN-LABEL: signed_i8:
+; DARWIN: xorl
+; DARWIN-NEXT: cmp
+; DARWIN-NEXT: sete
+; DARWIN-NEXT: ret
+}
+
+@a = common global i16 0
+@b = common global i16 0
+define zeroext i16 @unsigned_i16() {
+entry:
+ %0 = load i16, i16* @a
+ %1 = load i16, i16* @b
+ %add = add i16 %1, %0
+ ret i16 %add
+
+; i16 return values are not extended.
+; CHECK-LABEL: unsigned_i16:
+; BWOFF: movw
+; BWON: movzwl
+; CHECK-NEXT: addw
+; CHECK-NEXT: ret
+
+; Except on Darwin, for legacy reasons.
+; DARWIN-LABEL: unsigned_i16:
+; DARWIN-BWOFF: movw
+; DARWIN-BWON: movzwl
+; DARWIN-NEXT: addw
+; DARWIN-NEXT: movzwl
+; DARWIN-NEXT: ret
+}
+
+
+define i32 @use_i1() {
+entry:
+ %0 = call i1 @unsigned_i1();
+ %1 = zext i1 %0 to i32
+ ret i32 %1
+
+; The high 24 bits of %eax from a function returning i1 are undefined.
+; CHECK-LABEL: use_i1:
+; CHECK: call
+; CHECK-NEXT: movzbl
+; CHECK-NEXT: {{pop|add}}
+; CHECK-NEXT: ret
+}
+
+define i32 @use_i8() {
+entry:
+ %0 = call i8 @unsigned_i8();
+ %1 = zext i8 %0 to i32
+ ret i32 %1
+
+; The high 24 bits of %eax from a function returning i8 are undefined.
+; CHECK-LABEL: use_i8:
+; CHECK: call
+; CHECK-NEXT: movzbl
+; CHECK-NEXT: {{pop|add}}
+; CHECK-NEXT: ret
+}
+
+define i32 @use_i16() {
+entry:
+ %0 = call i16 @unsigned_i16();
+ %1 = zext i16 %0 to i32
+ ret i32 %1
+
+; The high 16 bits of %eax from a function returning i16 are undefined.
+; CHECK-LABEL: use_i16:
+; CHECK: call
+; CHECK-NEXT: movzwl
+; CHECK-NEXT: {{pop|add}}
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/X86/rtm.ll b/test/CodeGen/X86/rtm.ll
index 76eb9514f02c..fb06cac45fff 100644
--- a/test/CodeGen/X86/rtm.ll
+++ b/test/CodeGen/X86/rtm.ll
@@ -2,7 +2,8 @@
declare i32 @llvm.x86.xbegin() nounwind
declare void @llvm.x86.xend() nounwind
-declare void @llvm.x86.xabort(i8) noreturn nounwind
+declare void @llvm.x86.xabort(i8) nounwind
+declare void @f1()
define i32 @test_xbegin() nounwind uwtable {
entry:
@@ -24,7 +25,20 @@ entry:
define void @test_xabort() nounwind uwtable {
entry:
tail call void @llvm.x86.xabort(i8 2)
- unreachable
+ ret void
; CHECK: test_xabort
; CHECK: xabort $2
}
+
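+; Since xabort is no longer modeled as noreturn, the call to @f1 after it must
+; still be emitted.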
+define void @f2(i32 %x) nounwind uwtable {
+entry:
+ %x.addr = alloca i32, align 4
+ store i32 %x, i32* %x.addr, align 4
+ call void @llvm.x86.xabort(i8 1)
+ call void @f1()
+ ret void
+; CHECK-LABEL: f2
+; CHECK: xabort $1
+; CHECK: callq f1
+}
+ \ No newline at end of file
diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll
new file mode 100644
index 000000000000..17a933e50d0d
--- /dev/null
+++ b/test/CodeGen/X86/sad.ll
@@ -0,0 +1,1001 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW
+
+@a = global [1024 x i8] zeroinitializer, align 16
+@b = global [1024 x i8] zeroinitializer, align 16
+
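+; Each loop below computes a sum of absolute differences over the byte arrays
+; @a and @b (abs of the element-wise difference, accumulated and then reduced
+; to a scalar).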
+define i32 @sad_16i8() nounwind {
+; SSE2-LABEL: sad_16i8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: .p2align 4, 0x90
+; SSE2-NEXT: .LBB0_1: # %vector.body
+; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
+; SSE2-NEXT: movdqu a+1024(%rax), %xmm2
+; SSE2-NEXT: movdqu b+1024(%rax), %xmm3
+; SSE2-NEXT: psadbw %xmm2, %xmm3
+; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: addq $4, %rax
+; SSE2-NEXT: jne .LBB0_1
+; SSE2-NEXT: # BB#2: # %middle.block
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm0
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: sad_16i8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: .p2align 4, 0x90
+; AVX2-NEXT: .LBB0_1: # %vector.body
+; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX2-NEXT: vmovdqu a+1024(%rax), %xmm2
+; AVX2-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
+; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: addq $4, %rax
+; AVX2-NEXT: jne .LBB0_1
+; AVX2-NEXT: # BB#2: # %middle.block
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: sad_16i8:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512F-NEXT: .p2align 4, 0x90
+; AVX512F-NEXT: .LBB0_1: # %vector.body
+; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512F-NEXT: vmovdqu a+1024(%rax), %xmm1
+; AVX512F-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
+; AVX512F-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT: addq $4, %rax
+; AVX512F-NEXT: jne .LBB0_1
+; AVX512F-NEXT: # BB#2: # %middle.block
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: sad_16i8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512BW-NEXT: .p2align 4, 0x90
+; AVX512BW-NEXT: .LBB0_1: # %vector.body
+; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512BW-NEXT: vmovdqu a+1024(%rax), %xmm1
+; AVX512BW-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
+; AVX512BW-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: addq $4, %rax
+; AVX512BW-NEXT: jne .LBB0_1
+; AVX512BW-NEXT: # BB#2: # %middle.block
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: retq
+entry:
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
+ %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
+ %1 = bitcast i8* %0 to <16 x i8>*
+ %wide.load = load <16 x i8>, <16 x i8>* %1, align 4
+ %2 = zext <16 x i8> %wide.load to <16 x i32>
+ %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
+ %4 = bitcast i8* %3 to <16 x i8>*
+ %wide.load1 = load <16 x i8>, <16 x i8>* %4, align 4
+ %5 = zext <16 x i8> %wide.load1 to <16 x i32>
+ %6 = sub nsw <16 x i32> %2, %5
+ %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %8 = sub nsw <16 x i32> zeroinitializer, %6
+ %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
+ %10 = add nsw <16 x i32> %9, %vec.phi
+ %index.next = add i64 %index, 4
+ %11 = icmp eq i64 %index.next, 1024
+ br i1 %11, label %middle.block, label %vector.body
+
+middle.block:
+ %.lcssa = phi <16 x i32> [ %10, %vector.body ]
+ %rdx.shuf = shufflevector <16 x i32> %.lcssa, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx = add <16 x i32> %.lcssa, %rdx.shuf
+ %rdx.shuf2 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx2 = add <16 x i32> %bin.rdx, %rdx.shuf2
+ %rdx.shuf3 = shufflevector <16 x i32> %bin.rdx2, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx3 = add <16 x i32> %bin.rdx2, %rdx.shuf3
+ %rdx.shuf4 = shufflevector <16 x i32> %bin.rdx3, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx4 = add <16 x i32> %bin.rdx3, %rdx.shuf4
+ %12 = extractelement <16 x i32> %bin.rdx4, i32 0
+ ret i32 %12
+}
+
+define i32 @sad_32i8() nounwind {
+; SSE2-LABEL: sad_32i8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: pxor %xmm12, %xmm12
+; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm13, %xmm13
+; SSE2-NEXT: pxor %xmm15, %xmm15
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm14, %xmm14
+; SSE2-NEXT: .p2align 4, 0x90
+; SSE2-NEXT: .LBB1_1: # %vector.body
+; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
+; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa a+1040(%rax), %xmm1
+; SSE2-NEXT: movdqa a+1024(%rax), %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
+; SSE2-NEXT: movdqa b+1040(%rax), %xmm2
+; SSE2-NEXT: movdqa b+1024(%rax), %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm10, %xmm0
+; SSE2-NEXT: movdqa %xmm5, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm5, %xmm3
+; SSE2-NEXT: movdqa %xmm7, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm2, %xmm6
+; SSE2-NEXT: movdqa %xmm4, %xmm10
+; SSE2-NEXT: movdqa %xmm9, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm9, %xmm7
+; SSE2-NEXT: movdqa %xmm8, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm11, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
+; SSE2-NEXT: psubd %xmm11, %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
+; SSE2-NEXT: psubd %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm8, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm8
+; SSE2-NEXT: pxor %xmm4, %xmm8
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm7, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm7
+; SSE2-NEXT: pxor %xmm4, %xmm7
+; SSE2-NEXT: movdqa %xmm6, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm10, %xmm4
+; SSE2-NEXT: paddd %xmm1, %xmm15
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm13
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm3, %xmm4
+; SSE2-NEXT: paddd %xmm6, %xmm0
+; SSE2-NEXT: paddd %xmm7, %xmm14
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm5, %xmm3
+; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm8, %xmm1
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: addq $4, %rax
+; SSE2-NEXT: jne .LBB1_1
+; SSE2-NEXT: # BB#2: # %middle.block
+; SSE2-NEXT: paddd %xmm15, %xmm4
+; SSE2-NEXT: paddd %xmm14, %xmm1
+; SSE2-NEXT: paddd %xmm13, %xmm0
+; SSE2-NEXT: paddd %xmm5, %xmm2
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: sad_32i8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: .p2align 4, 0x90
+; AVX2-NEXT: .LBB1_1: # %vector.body
+; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm2
+; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
+; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: addq $4, %rax
+; AVX2-NEXT: jne .LBB1_1
+; AVX2-NEXT: # BB#2: # %middle.block
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: sad_32i8:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: .p2align 4, 0x90
+; AVX512F-NEXT: .LBB1_1: # %vector.body
+; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm2
+; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
+; AVX512F-NEXT: vpaddd %ymm1, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: addq $4, %rax
+; AVX512F-NEXT: jne .LBB1_1
+; AVX512F-NEXT: # BB#2: # %middle.block
+; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: sad_32i8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: .p2align 4, 0x90
+; AVX512BW-NEXT: .LBB1_1: # %vector.body
+; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512BW-NEXT: vmovdqa a+1024(%rax), %ymm2
+; AVX512BW-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
+; AVX512BW-NEXT: vpaddd %ymm1, %ymm2, %ymm2
+; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512BW-NEXT: addq $4, %rax
+; AVX512BW-NEXT: jne .LBB1_1
+; AVX512BW-NEXT: # BB#2: # %middle.block
+; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: retq
+entry:
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %vec.phi = phi <32 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
+ %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
+ %1 = bitcast i8* %0 to <32 x i8>*
+ %wide.load = load <32 x i8>, <32 x i8>* %1, align 32
+ %2 = zext <32 x i8> %wide.load to <32 x i32>
+ %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
+ %4 = bitcast i8* %3 to <32 x i8>*
+ %wide.load1 = load <32 x i8>, <32 x i8>* %4, align 32
+ %5 = zext <32 x i8> %wide.load1 to <32 x i32>
+ %6 = sub nsw <32 x i32> %2, %5
+ %7 = icmp sgt <32 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %8 = sub nsw <32 x i32> zeroinitializer, %6
+ %9 = select <32 x i1> %7, <32 x i32> %6, <32 x i32> %8
+ %10 = add nsw <32 x i32> %9, %vec.phi
+ %index.next = add i64 %index, 4
+ %11 = icmp eq i64 %index.next, 1024
+ br i1 %11, label %middle.block, label %vector.body
+
+middle.block:
+ %.lcssa = phi <32 x i32> [ %10, %vector.body ]
+ %rdx.shuf = shufflevector <32 x i32> %.lcssa, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx = add <32 x i32> %.lcssa, %rdx.shuf
+ %rdx.shuf2 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx2 = add <32 x i32> %bin.rdx, %rdx.shuf2
+ %rdx.shuf3 = shufflevector <32 x i32> %bin.rdx2, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx3 = add <32 x i32> %bin.rdx2, %rdx.shuf3
+ %rdx.shuf4 = shufflevector <32 x i32> %bin.rdx3, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx4 = add <32 x i32> %bin.rdx3, %rdx.shuf4
+ %rdx.shuf5 = shufflevector <32 x i32> %bin.rdx4, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx5 = add <32 x i32> %bin.rdx4, %rdx.shuf5
+ %12 = extractelement <32 x i32> %bin.rdx5, i32 0
+ ret i32 %12
+}
+
+define i32 @sad_avx64i8() nounwind {
+; SSE2-LABEL: sad_avx64i8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: subq $216, %rsp
+; SSE2-NEXT: pxor %xmm6, %xmm6
+; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm13, %xmm13
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm10, %xmm10
+; SSE2-NEXT: pxor %xmm12, %xmm12
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm15, %xmm15
+; SSE2-NEXT: pxor %xmm11, %xmm11
+; SSE2-NEXT: pxor %xmm8, %xmm8
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: .p2align 4, 0x90
+; SSE2-NEXT: .LBB2_1: # %vector.body
+; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
+; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa a+1040(%rax), %xmm13
+; SSE2-NEXT: movdqa a+1024(%rax), %xmm12
+; SSE2-NEXT: movdqa a+1056(%rax), %xmm10
+; SSE2-NEXT: movdqa a+1072(%rax), %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,0,1]
+; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
+; SSE2-NEXT: movdqa %xmm10, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
+; SSE2-NEXT: movdqa %xmm12, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm15
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7]
+; SSE2-NEXT: movdqa %xmm13, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7]
+; SSE2-NEXT: movdqa b+1040(%rax), %xmm7
+; SSE2-NEXT: movdqa b+1024(%rax), %xmm11
+; SSE2-NEXT: movdqa b+1056(%rax), %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE2-NEXT: movdqa %xmm7, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE2-NEXT: psubd %xmm7, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm11[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm11, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
+; SSE2-NEXT: psubd %xmm11, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm9[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; SSE2-NEXT: psubd %xmm2, %xmm15
+; SSE2-NEXT: movdqa %xmm15, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm9, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; SSE2-NEXT: psubd %xmm9, %xmm10
+; SSE2-NEXT: movdqa %xmm5, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; SSE2-NEXT: psubd %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm15
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE2-NEXT: psubd %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm7, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE2-NEXT: psubd %xmm7, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE2-NEXT: psubd %xmm0, %xmm15
+; SSE2-NEXT: movdqa %xmm1, %xmm11
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm14, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
+; SSE2-NEXT: psubd %xmm14, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm14
+; SSE2-NEXT: movdqa %xmm8, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; SSE2-NEXT: psubd %xmm0, %xmm11
+; SSE2-NEXT: movdqa b+1072(%rax), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE2-NEXT: psubd %xmm0, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; SSE2-NEXT: psubd %xmm5, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE2-NEXT: movdqa %xmm7, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; SSE2-NEXT: psubd %xmm2, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; SSE2-NEXT: psubd %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm7, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm7
+; SSE2-NEXT: pxor %xmm2, %xmm7
+; SSE2-NEXT: movdqa %xmm9, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm9
+; SSE2-NEXT: pxor %xmm2, %xmm9
+; SSE2-NEXT: movdqa %xmm8, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm8
+; SSE2-NEXT: pxor %xmm2, %xmm8
+; SSE2-NEXT: movdqa %xmm11, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm11
+; SSE2-NEXT: pxor %xmm2, %xmm11
+; SSE2-NEXT: movdqa %xmm14, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm14
+; SSE2-NEXT: pxor %xmm2, %xmm14
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm15, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm15
+; SSE2-NEXT: pxor %xmm2, %xmm15
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm4, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm10, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm10
+; SSE2-NEXT: pxor %xmm2, %xmm10
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm12, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm12
+; SSE2-NEXT: pxor %xmm2, %xmm12
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm13, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm13
+; SSE2-NEXT: pxor %xmm2, %xmm13
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm13, %xmm5
+; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm1, %xmm13
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm12, %xmm5
+; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm10, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm15, %xmm3
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm14, %xmm15
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 # 16-byte Reload
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm9, %xmm4
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm7, %xmm5
+; SSE2-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm0, %xmm7
+; SSE2-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: addq $4, %rax
+; SSE2-NEXT: jne .LBB2_1
+; SSE2-NEXT: # BB#2: # %middle.block
+; SSE2-NEXT: paddd %xmm15, %xmm3
+; SSE2-NEXT: paddd %xmm5, %xmm10
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm8, %xmm13
+; SSE2-NEXT: paddd %xmm11, %xmm2
+; SSE2-NEXT: paddd %xmm0, %xmm12
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm1, %xmm5
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm2, %xmm12
+; SSE2-NEXT: paddd %xmm3, %xmm10
+; SSE2-NEXT: paddd %xmm13, %xmm10
+; SSE2-NEXT: paddd %xmm0, %xmm12
+; SSE2-NEXT: paddd %xmm5, %xmm12
+; SSE2-NEXT: paddd %xmm10, %xmm12
+; SSE2-NEXT: paddd %xmm6, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,0,1]
+; SSE2-NEXT: paddd %xmm12, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: addq $216, %rsp
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: sad_avx64i8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpxor %ymm6, %ymm6, %ymm6
+; AVX2-NEXT: vpxor %ymm5, %ymm5, %ymm5
+; AVX2-NEXT: vpxor %ymm7, %ymm7, %ymm7
+; AVX2-NEXT: .p2align 4, 0x90
+; AVX2-NEXT: .LBB2_1: # %vector.body
+; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpsubd %ymm8, %ymm15, %ymm8
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9
+; AVX2-NEXT: vmovdqu %ymm9, -{{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm9 # 32-byte Reload
+; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm15
+; AVX2-NEXT: vpabsd %ymm8, %ymm8
+; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3
+; AVX2-NEXT: vpabsd %ymm14, %ymm8
+; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1
+; AVX2-NEXT: vpabsd %ymm13, %ymm8
+; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2
+; AVX2-NEXT: vpabsd %ymm12, %ymm8
+; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0
+; AVX2-NEXT: vpabsd %ymm11, %ymm8
+; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4
+; AVX2-NEXT: vpabsd %ymm10, %ymm8
+; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6
+; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5
+; AVX2-NEXT: vpabsd %ymm15, %ymm8
+; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7
+; AVX2-NEXT: addq $4, %rax
+; AVX2-NEXT: jne .LBB2_1
+; AVX2-NEXT: # BB#2: # %middle.block
+; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: sad_avx64i8:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512F-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; AVX512F-NEXT: .p2align 4, 0x90
+; AVX512F-NEXT: .LBB2_1: # %vector.body
+; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; AVX512F-NEXT: vpsubd %zmm11, %zmm7, %zmm7
+; AVX512F-NEXT: vpsubd %zmm10, %zmm6, %zmm6
+; AVX512F-NEXT: vpsubd %zmm9, %zmm5, %zmm5
+; AVX512F-NEXT: vpsubd %zmm8, %zmm4, %zmm4
+; AVX512F-NEXT: vpabsd %zmm4, %zmm4
+; AVX512F-NEXT: vpabsd %zmm5, %zmm5
+; AVX512F-NEXT: vpabsd %zmm6, %zmm6
+; AVX512F-NEXT: vpabsd %zmm7, %zmm7
+; AVX512F-NEXT: vpaddd %zmm3, %zmm7, %zmm3
+; AVX512F-NEXT: vpaddd %zmm2, %zmm6, %zmm2
+; AVX512F-NEXT: vpaddd %zmm1, %zmm5, %zmm1
+; AVX512F-NEXT: vpaddd %zmm0, %zmm4, %zmm0
+; AVX512F-NEXT: addq $4, %rax
+; AVX512F-NEXT: jne .LBB2_1
+; AVX512F-NEXT: # BB#2: # %middle.block
+; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: sad_avx64i8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: .p2align 4, 0x90
+; AVX512BW-NEXT: .LBB2_1: # %vector.body
+; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512BW-NEXT: vmovdqu8 a+1024(%rax), %zmm2
+; AVX512BW-NEXT: vpsadbw b+1024(%rax), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: addq $4, %rax
+; AVX512BW-NEXT: jne .LBB2_1
+; AVX512BW-NEXT: # BB#2: # %middle.block
+; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddd %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: retq
+entry:
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %vec.phi = phi <64 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
+ %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
+ %1 = bitcast i8* %0 to <64 x i8>*
+ %wide.load = load <64 x i8>, <64 x i8>* %1, align 64
+ %2 = zext <64 x i8> %wide.load to <64 x i32>
+ %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
+ %4 = bitcast i8* %3 to <64 x i8>*
+ %wide.load1 = load <64 x i8>, <64 x i8>* %4, align 64
+ %5 = zext <64 x i8> %wide.load1 to <64 x i32>
+ %6 = sub nsw <64 x i32> %2, %5
+ %7 = icmp sgt <64 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %8 = sub nsw <64 x i32> zeroinitializer, %6
+ %9 = select <64 x i1> %7, <64 x i32> %6, <64 x i32> %8
+ %10 = add nsw <64 x i32> %9, %vec.phi
+ %index.next = add i64 %index, 4
+ %11 = icmp eq i64 %index.next, 1024
+ br i1 %11, label %middle.block, label %vector.body
+
+middle.block:
+ %.lcssa = phi <64 x i32> [ %10, %vector.body ]
+ %rdx.shuf = shufflevector <64 x i32> %.lcssa, <64 x i32> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx = add <64 x i32> %.lcssa, %rdx.shuf
+ %rdx.shuf2 = shufflevector <64 x i32> %bin.rdx, <64 x i32> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx2 = add <64 x i32> %bin.rdx, %rdx.shuf2
+ %rdx.shuf3 = shufflevector <64 x i32> %bin.rdx2, <64 x i32> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx3 = add <64 x i32> %bin.rdx2, %rdx.shuf3
+ %rdx.shuf4 = shufflevector <64 x i32> %bin.rdx3, <64 x i32> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx4 = add <64 x i32> %bin.rdx3, %rdx.shuf4
+ %rdx.shuf5 = shufflevector <64 x i32> %bin.rdx4, <64 x i32> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx5 = add <64 x i32> %bin.rdx4, %rdx.shuf5
+ %rdx.shuf6 = shufflevector <64 x i32> %bin.rdx5, <64 x i32> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx6 = add <64 x i32> %bin.rdx5, %rdx.shuf6
+ %12 = extractelement <64 x i32> %bin.rdx6, i32 0
+ ret i32 %12
+}
+
+define i32 @sad_2i8() nounwind {
+; SSE2-LABEL: sad_2i8:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; SSE2-NEXT: movl $65535, %ecx # imm = 0xFFFF
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: .p2align 4, 0x90
+; SSE2-NEXT: .LBB3_1: # %vector.body
+; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: psadbw %xmm3, %xmm2
+; SSE2-NEXT: paddq %xmm2, %xmm0
+; SSE2-NEXT: addq $4, %rax
+; SSE2-NEXT: jne .LBB3_1
+; SSE2-NEXT: # BB#2: # %middle.block
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: paddq %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
+; SSE2-NEXT: retq
+;
+; AVX2-LABEL: sad_2i8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: .p2align 4, 0x90
+; AVX2-NEXT: .LBB3_1: # %vector.body
+; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX2-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; AVX2-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: addq $4, %rax
+; AVX2-NEXT: jne .LBB3_1
+; AVX2-NEXT: # BB#2: # %middle.block
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: sad_2i8:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: .p2align 4, 0x90
+; AVX512F-NEXT: .LBB3_1: # %vector.body
+; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: addq $4, %rax
+; AVX512F-NEXT: jne .LBB3_1
+; AVX512F-NEXT: # BB#2: # %middle.block
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: sad_2i8:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: .p2align 4, 0x90
+; AVX512BW-NEXT: .LBB3_1: # %vector.body
+; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; AVX512BW-NEXT: addq $4, %rax
+; AVX512BW-NEXT: jne .LBB3_1
+; AVX512BW-NEXT: # BB#2: # %middle.block
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; AVX512BW-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: retq
+entry:
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %vec.phi = phi <2 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
+ %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index
+ %1 = bitcast i8* %0 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %1, align 4
+ %2 = zext <2 x i8> %wide.load to <2 x i32>
+ %3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %index
+ %4 = bitcast i8* %3 to <2 x i8>*
+ %wide.load1 = load <2 x i8>, <2 x i8>* %4, align 4
+ %5 = zext <2 x i8> %wide.load1 to <2 x i32>
+ %6 = sub nsw <2 x i32> %2, %5
+ %7 = icmp sgt <2 x i32> %6, <i32 -1, i32 -1>
+ %8 = sub nsw <2 x i32> zeroinitializer, %6
+ %9 = select <2 x i1> %7, <2 x i32> %6, <2 x i32> %8
+ %10 = add nsw <2 x i32> %9, %vec.phi
+ %index.next = add i64 %index, 4
+ %11 = icmp eq i64 %index.next, 1024
+ br i1 %11, label %middle.block, label %vector.body
+
+middle.block:
+ %.lcssa = phi <2 x i32> [ %10, %vector.body ]
+ %rdx.shuf = shufflevector <2 x i32> %.lcssa, <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
+ %bin.rdx = add <2 x i32> %.lcssa, %rdx.shuf
+ %12 = extractelement <2 x i32> %bin.rdx, i32 0
+ ret i32 %12
+}
+
diff --git a/test/CodeGen/X86/safestack_ssp.ll b/test/CodeGen/X86/safestack_ssp.ll
new file mode 100644
index 000000000000..5a1a465158cf
--- /dev/null
+++ b/test/CodeGen/X86/safestack_ssp.ll
@@ -0,0 +1,27 @@
+; Test codegen pipeline for SafeStack + StackProtector combination.
+; RUN: llc -mtriple=i386-linux < %s -o - | FileCheck --check-prefix=LINUX-I386 %s
+; RUN: llc -mtriple=x86_64-linux < %s -o - | FileCheck --check-prefix=LINUX-X64 %s
+
+define void @_Z1fv() safestack sspreq {
+entry:
+ %x = alloca i32, align 4
+ %0 = bitcast i32* %x to i8*
+ call void @_Z7CapturePi(i32* nonnull %x)
+ ret void
+}
+
+declare void @_Z7CapturePi(i32*)
+
+; LINUX-X64-DAG: movq __safestack_unsafe_stack_ptr@GOTTPOFF(%rip), %[[A:.*]]
+; LINUX-X64-DAG: movq %fs:(%[[A]]), %[[B:.*]]
+; LINUX-X64-DAG: movq %fs:40, %[[COOKIE:.*]]
+; LINUX-X64-DAG: leaq -16(%[[B]]), %[[C:.*]]
+; LINUX-X64-DAG: movq %[[C]], %fs:(%[[A]])
+; LINUX-X64-DAG: movq %[[COOKIE]], -8(%[[B]])
+
+; LINUX-I386-DAG: movl __safestack_unsafe_stack_ptr@INDNTPOFF, %[[A:.*]]
+; LINUX-I386-DAG: movl %gs:(%[[A]]), %[[B:.*]]
+; LINUX-I386-DAG: movl %gs:20, %[[COOKIE:.*]]
+; LINUX-I386-DAG: leal -16(%[[B]]), %[[C:.*]]
+; LINUX-I386-DAG: movl %[[C]], %gs:(%[[A]])
+; LINUX-I386-DAG: movl %[[COOKIE]], -4(%[[B]])
diff --git a/test/CodeGen/X86/segmented-stacks.ll b/test/CodeGen/X86/segmented-stacks.ll
index 55eaab91da50..a0cd1824629a 100644
--- a/test/CodeGen/X86/segmented-stacks.ll
+++ b/test/CodeGen/X86/segmented-stacks.ll
@@ -44,7 +44,7 @@ define void @test_basic() #0 {
; X32-Linux-NEXT: ja .LBB0_2
; X32-Linux: pushl $0
-; X32-Linux-NEXT: pushl $60
+; X32-Linux-NEXT: pushl $44
; X32-Linux-NEXT: calll __morestack
; X32-Linux-NEXT: ret
@@ -105,7 +105,7 @@ define void @test_basic() #0 {
; X32-MinGW-NEXT: ja LBB0_2
; X32-MinGW: pushl $0
-; X32-MinGW-NEXT: pushl $48
+; X32-MinGW-NEXT: pushl $40
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
@@ -135,7 +135,7 @@ define void @test_basic() #0 {
; X32-DFlyBSD-NEXT: ja .LBB0_2
; X32-DFlyBSD: pushl $0
-; X32-DFlyBSD-NEXT: pushl $48
+; X32-DFlyBSD-NEXT: pushl $40
; X32-DFlyBSD-NEXT: calll __morestack
; X32-DFlyBSD-NEXT: ret
@@ -162,7 +162,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
; X32-Linux-NEXT: ja .LBB1_2
; X32-Linux: pushl $4
-; X32-Linux-NEXT: pushl $60
+; X32-Linux-NEXT: pushl $44
; X32-Linux-NEXT: calll __morestack
; X32-Linux-NEXT: ret
@@ -209,7 +209,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
; X32-MinGW-NEXT: ja LBB1_2
; X32-MinGW: pushl $4
-; X32-MinGW-NEXT: pushl $52
+; X32-MinGW-NEXT: pushl $44
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
@@ -238,7 +238,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
; X32-DFlyBSD-NEXT: ja .LBB1_2
; X32-DFlyBSD: pushl $4
-; X32-DFlyBSD-NEXT: pushl $52
+; X32-DFlyBSD-NEXT: pushl $44
; X32-DFlyBSD-NEXT: calll __morestack
; X32-DFlyBSD-NEXT: ret
@@ -305,12 +305,12 @@ define void @test_large() #0 {
; X64-Darwin-NEXT: callq ___morestack
; X64-Darwin-NEXT: ret
-; X32-MinGW: leal -40008(%esp), %ecx
+; X32-MinGW: leal -40000(%esp), %ecx
; X32-MinGW-NEXT: cmpl %fs:20, %ecx
; X32-MinGW-NEXT: ja LBB2_2
; X32-MinGW: pushl $0
-; X32-MinGW-NEXT: pushl $40008
+; X32-MinGW-NEXT: pushl $40000
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
@@ -333,12 +333,12 @@ define void @test_large() #0 {
; X64-FreeBSD-NEXT: callq __morestack
; X64-FreeBSD-NEXT: ret
-; X32-DFlyBSD: leal -40008(%esp), %ecx
+; X32-DFlyBSD: leal -40000(%esp), %ecx
; X32-DFlyBSD-NEXT: cmpl %fs:16, %ecx
; X32-DFlyBSD-NEXT: ja .LBB2_2
; X32-DFlyBSD: pushl $0
-; X32-DFlyBSD-NEXT: pushl $40008
+; X32-DFlyBSD-NEXT: pushl $40000
; X32-DFlyBSD-NEXT: calll __morestack
; X32-DFlyBSD-NEXT: ret
@@ -364,7 +364,7 @@ define fastcc void @test_fastcc() #0 {
; X32-Linux-NEXT: ja .LBB3_2
; X32-Linux: pushl $0
-; X32-Linux-NEXT: pushl $60
+; X32-Linux-NEXT: pushl $44
; X32-Linux-NEXT: calll __morestack
; X32-Linux-NEXT: ret
@@ -415,7 +415,7 @@ define fastcc void @test_fastcc() #0 {
; X32-MinGW-NEXT: ja LBB3_2
; X32-MinGW: pushl $0
-; X32-MinGW-NEXT: pushl $48
+; X32-MinGW-NEXT: pushl $40
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
@@ -445,7 +445,7 @@ define fastcc void @test_fastcc() #0 {
; X32-DFlyBSD-NEXT: ja .LBB3_2
; X32-DFlyBSD: pushl $0
-; X32-DFlyBSD-NEXT: pushl $48
+; X32-DFlyBSD-NEXT: pushl $40
; X32-DFlyBSD-NEXT: calll __morestack
; X32-DFlyBSD-NEXT: ret
@@ -524,12 +524,12 @@ define fastcc void @test_fastcc_large() #0 {
; X32-MinGW-LABEL: test_fastcc_large:
-; X32-MinGW: leal -40008(%esp), %eax
+; X32-MinGW: leal -40000(%esp), %eax
; X32-MinGW-NEXT: cmpl %fs:20, %eax
; X32-MinGW-NEXT: ja LBB4_2
; X32-MinGW: pushl $0
-; X32-MinGW-NEXT: pushl $40008
+; X32-MinGW-NEXT: pushl $40000
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
@@ -557,12 +557,12 @@ define fastcc void @test_fastcc_large() #0 {
; X32-DFlyBSD-LABEL: test_fastcc_large:
-; X32-DFlyBSD: leal -40008(%esp), %eax
+; X32-DFlyBSD: leal -40000(%esp), %eax
; X32-DFlyBSD-NEXT: cmpl %fs:16, %eax
; X32-DFlyBSD-NEXT: ja .LBB4_2
; X32-DFlyBSD: pushl $0
-; X32-DFlyBSD-NEXT: pushl $40008
+; X32-DFlyBSD-NEXT: pushl $40000
; X32-DFlyBSD-NEXT: calll __morestack
; X32-DFlyBSD-NEXT: ret
diff --git a/test/CodeGen/X86/seh-catch-all-win32.ll b/test/CodeGen/X86/seh-catch-all-win32.ll
index e8da7ab971b1..5ecf37e5248c 100644
--- a/test/CodeGen/X86/seh-catch-all-win32.ll
+++ b/test/CodeGen/X86/seh-catch-all-win32.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s
+; RUN: llc -stack-symbol-ordering=0 -mtriple=i686-windows-msvc < %s | FileCheck %s
; 32-bit catch-all has to use a filter function because that's how it saves the
; exception code.
@@ -75,13 +75,13 @@ entry:
; CHECK: movl -24(%ebp), %esp
; EH state -1
; CHECK: movl [[code_offs]](%ebp), %[[code:[a-z]+]]
-; CHECK-DAG: movl %[[code]], 4(%esp)
-; CHECK-DAG: movl $_str, (%esp)
+; CHECK: pushl %[[code]]
+; CHECK: pushl $_str
; CHECK: calll _printf
; CHECK: .section .xdata,"dr"
; CHECK: Lmain$parent_frame_offset = [[reg_offs]]
-; CHECK: .align 4
+; CHECK: .p2align 2
; CHECK: L__ehtable$main
; CHECK-NEXT: .long -1
; CHECK-NEXT: .long _filt$main
diff --git a/test/CodeGen/X86/seh-safe-div-win32.ll b/test/CodeGen/X86/seh-safe-div-win32.ll
index 643af3a472fb..3d46e3867625 100644
--- a/test/CodeGen/X86/seh-safe-div-win32.ll
+++ b/test/CodeGen/X86/seh-safe-div-win32.ll
@@ -65,13 +65,13 @@ __try.cont:
; Landing pad code
-; CHECK: [[handler0:LBB0_[0-9]+]]: # %handler0
+; CHECK: [[handler1:LBB0_[0-9]+]]: # %handler1
; Restore SP
; CHECK: movl {{.*}}(%ebp), %esp
; CHECK: calll _puts
; CHECK: jmp [[cont_bb]]
-; CHECK: [[handler1:LBB0_[0-9]+]]: # %handler1
+; CHECK: [[handler0:LBB0_[0-9]+]]: # %handler0
; Restore SP
; CHECK: movl {{.*}}(%ebp), %esp
; CHECK: calll _puts
diff --git a/test/CodeGen/X86/seh-safe-div.ll b/test/CodeGen/X86/seh-safe-div.ll
index 60918cf07058..d46e235c59ac 100644
--- a/test/CodeGen/X86/seh-safe-div.ll
+++ b/test/CodeGen/X86/seh-safe-div.ll
@@ -67,14 +67,14 @@ __try.cont:
; Landing pad code
-; CHECK: [[handler0:\.LBB0_[0-9]+]]: # %handler0
+; CHECK: [[handler1:\.LBB0_[0-9]+]]: # %handler1
; CHECK: callq puts
-; CHECK: movl $-1, [[rloc]]
+; CHECK: movl $-2, [[rloc]]
; CHECK: jmp [[cont_bb]]
-; CHECK: [[handler1:\.LBB0_[0-9]+]]: # %handler1
+; CHECK: [[handler0:\.LBB0_[0-9]+]]: # %handler0
; CHECK: callq puts
-; CHECK: movl $-2, [[rloc]]
+; CHECK: movl $-1, [[rloc]]
; CHECK: jmp [[cont_bb]]
; CHECK: .seh_handlerdata
diff --git a/test/CodeGen/X86/seh-stack-realign.ll b/test/CodeGen/X86/seh-stack-realign.ll
index 654cad347f6b..1225faebdb83 100644
--- a/test/CodeGen/X86/seh-stack-realign.ll
+++ b/test/CodeGen/X86/seh-stack-realign.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s
+; RUN: llc -stack-symbol-ordering=0 -mtriple=i686-windows-msvc < %s | FileCheck %s
; 32-bit catch-all has to use a filter function because that's how it saves the
; exception code.
@@ -57,19 +57,19 @@ entry:
; CHECK: movl %esp, [[reg_offs:[-0-9]+]](%esi)
; CHECK: movl $L__ehtable$main,
; EH state 0
-; CHECK: movl $0, 40(%esi)
+; CHECK: movl $0, 32(%esi)
; CHECK: calll _crash
; CHECK: retl
; CHECK: LBB0_[[lpbb:[0-9]+]]: # %__except
; Restore ESP
; CHECK: movl -24(%ebp), %esp
; Restore ESI
-; CHECK: leal -44(%ebp), %esi
+; CHECK: leal -36(%ebp), %esi
; Restore EBP
-; CHECK: movl 12(%esi), %ebp
+; CHECK: movl 4(%esi), %ebp
; CHECK: movl [[code_offs]](%esi), %[[code:[a-z]+]]
-; CHECK-DAG: movl %[[code]], 4(%esp)
-; CHECK-DAG: movl $_str, (%esp)
+; CHECK: pushl %[[code]]
+; CHECK: pushl $_str
; CHECK: calll _printf
; CHECK: .section .xdata,"dr"
diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll
index 91b42bd67767..10658f3fa4ef 100644
--- a/test/CodeGen/X86/setcc-lowering.ll
+++ b/test/CodeGen/X86/setcc-lowering.ll
@@ -33,7 +33,7 @@ entry:
define void @pr26232(i64 %a) {
; KNL-32-LABEL: pr26232:
-; KNL-32: # BB#0: # %for_test11.preheader
+; KNL-32: # BB#0: # %for_loop599.preheader
; KNL-32-NEXT: pushl %esi
; KNL-32-NEXT: .Ltmp0:
; KNL-32-NEXT: .cfi_def_cfa_offset 8
@@ -42,7 +42,7 @@ define void @pr26232(i64 %a) {
; KNL-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; KNL-32-NEXT: movw $-1, %dx
-; KNL-32-NEXT: .align 16, 0x90
+; KNL-32-NEXT: .p2align 4, 0x90
; KNL-32-NEXT: .LBB1_1: # %for_loop599
; KNL-32-NEXT: # =>This Inner Loop Header: Depth=1
; KNL-32-NEXT: cmpl $65536, %ecx # imm = 0x10000
diff --git a/test/CodeGen/X86/setcc-narrowing.ll b/test/CodeGen/X86/setcc-narrowing.ll
index bf5b45031a24..a4259ddd2318 100644
--- a/test/CodeGen/X86/setcc-narrowing.ll
+++ b/test/CodeGen/X86/setcc-narrowing.ll
@@ -6,9 +6,9 @@
define i32 @t1() nounwind ssp {
entry:
; CHECK-LABEL: t1:
-; CHECK: cmpl $0, _t1.global
+; CHECK: xorl %eax, %eax
+; CHECK-NEXT: cmpl $0, _t1.global
; CHECK-NEXT: setne %al
-; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: ret
%0 = load i64, i64* @t1.global, align 8
%and = and i64 4294967295, %0
diff --git a/test/CodeGen/X86/setcc.ll b/test/CodeGen/X86/setcc.ll
index b4847c54ffaf..eabcda4e075f 100644
--- a/test/CodeGen/X86/setcc.ll
+++ b/test/CodeGen/X86/setcc.ll
@@ -7,8 +7,8 @@
define zeroext i16 @t1(i16 zeroext %x) nounwind readnone ssp {
entry:
; CHECK-LABEL: t1:
+; CHECK: xorl %eax, %eax
; CHECK: seta %al
-; CHECK: movzbl %al, %eax
; CHECK: shll $5, %eax
%0 = icmp ugt i16 %x, 26 ; <i1> [#uses=1]
%iftmp.1.0 = select i1 %0, i16 32, i16 0 ; <i16> [#uses=1]
@@ -54,3 +54,27 @@ entry:
%add = shl nuw nsw i32 %conv4.2, 16
ret i32 %add
}
+
+define i8 @t5(i32 %a) #0 {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: testl %edi, %edi
+; CHECK: setns %al
+ %.lobit = lshr i32 %a, 31
+ %trunc = trunc i32 %.lobit to i8
+ %.not = xor i8 %trunc, 1
+ ret i8 %.not
+}
+
+define zeroext i1 @t6(i32 %a) #0 {
+entry:
+; CHECK-LABEL: t6:
+; CHECK: testl %edi, %edi
+; CHECK: setns %al
+ %.lobit = lshr i32 %a, 31
+ %trunc = trunc i32 %.lobit to i1
+ %.not = xor i1 %trunc, 1
+ ret i1 %.not
+}
+
+attributes #0 = { "target-cpu"="skylake-avx512" }
diff --git a/test/CodeGen/X86/sext-ret-val.ll b/test/CodeGen/X86/sext-ret-val.ll
index da1a1871e7e8..33de80f02494 100644
--- a/test/CodeGen/X86/sext-ret-val.ll
+++ b/test/CodeGen/X86/sext-ret-val.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep movzbl | count 1
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s
; rdar://6699246
define signext i8 @t1(i8* %A) nounwind readnone ssp {
@@ -6,6 +6,11 @@ entry:
%0 = icmp ne i8* %A, null
%1 = zext i1 %0 to i8
ret i8 %1
+
+; CHECK-LABEL: t1:
+; CHECK: cmpl
+; CHECK-NEXT: setne
+; CHECK-NEXT: retl
}
define i8 @t2(i8* %A) nounwind readnone ssp {
@@ -13,4 +18,9 @@ entry:
%0 = icmp ne i8* %A, null
%1 = zext i1 %0 to i8
ret i8 %1
+
+; CHECK-LABEL: t2:
+; CHECK: cmpl
+; CHECK-NEXT: setne
+; CHECK-NEXT: retl
}
diff --git a/test/CodeGen/X86/sext-setcc-self.ll b/test/CodeGen/X86/sext-setcc-self.ll
index 23d66a24724d..e739d21e64e0 100644
--- a/test/CodeGen/X86/sext-setcc-self.ll
+++ b/test/CodeGen/X86/sext-setcc-self.ll
@@ -1,55 +1,68 @@
-; RUN: llc -march=x86-64 -mcpu=nehalem -asm-verbose=false < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
define <4 x i32> @test_ueq(<4 x float> %in) {
-entry:
- ; CHECK: pcmpeqd %xmm0, %xmm0
- ; CHECK-NEXT: ret
- %0 = fcmp ueq <4 x float> %in, %in
- %1 = sext <4 x i1> %0 to <4 x i32>
- ret <4 x i32> %1
+; CHECK-LABEL: test_ueq:
+; CHECK: # BB#0:
+; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
+; CHECK-NEXT: retq
+;
+ %t0 = fcmp ueq <4 x float> %in, %in
+ %t1 = sext <4 x i1> %t0 to <4 x i32>
+ ret <4 x i32> %t1
}
define <4 x i32> @test_uge(<4 x float> %in) {
-entry:
- ; CHECK: pcmpeqd %xmm0, %xmm0
- ; CHECK-NEXT: ret
- %0 = fcmp uge <4 x float> %in, %in
- %1 = sext <4 x i1> %0 to <4 x i32>
- ret <4 x i32> %1
+; CHECK-LABEL: test_uge:
+; CHECK: # BB#0:
+; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
+; CHECK-NEXT: retq
+;
+ %t0 = fcmp uge <4 x float> %in, %in
+ %t1 = sext <4 x i1> %t0 to <4 x i32>
+ ret <4 x i32> %t1
}
define <4 x i32> @test_ule(<4 x float> %in) {
-entry:
- ; CHECK: pcmpeqd %xmm0, %xmm0
- ; CHECK-NEXT: ret
- %0 = fcmp ule <4 x float> %in, %in
- %1 = sext <4 x i1> %0 to <4 x i32>
- ret <4 x i32> %1
+; CHECK-LABEL: test_ule:
+; CHECK: # BB#0:
+; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
+; CHECK-NEXT: retq
+;
+ %t0 = fcmp ule <4 x float> %in, %in
+ %t1 = sext <4 x i1> %t0 to <4 x i32>
+ ret <4 x i32> %t1
}
define <4 x i32> @test_one(<4 x float> %in) {
-entry:
- ; CHECK: xorps %xmm0, %xmm0
- ; CHECK-NEXT: ret
- %0 = fcmp one <4 x float> %in, %in
- %1 = sext <4 x i1> %0 to <4 x i32>
- ret <4 x i32> %1
+; CHECK-LABEL: test_one:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: retq
+;
+ %t0 = fcmp one <4 x float> %in, %in
+ %t1 = sext <4 x i1> %t0 to <4 x i32>
+ ret <4 x i32> %t1
}
define <4 x i32> @test_ogt(<4 x float> %in) {
-entry:
- ; CHECK: xorps %xmm0, %xmm0
- ; CHECK-NEXT: ret
- %0 = fcmp ogt <4 x float> %in, %in
- %1 = sext <4 x i1> %0 to <4 x i32>
- ret <4 x i32> %1
+; CHECK-LABEL: test_ogt:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: retq
+;
+ %t0 = fcmp ogt <4 x float> %in, %in
+ %t1 = sext <4 x i1> %t0 to <4 x i32>
+ ret <4 x i32> %t1
}
define <4 x i32> @test_olt(<4 x float> %in) {
-entry:
- ; CHECK: xorps %xmm0, %xmm0
- ; CHECK-NEXT: ret
- %0 = fcmp olt <4 x float> %in, %in
- %1 = sext <4 x i1> %0 to <4 x i32>
- ret <4 x i32> %1
+; CHECK-LABEL: test_olt:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: retq
+;
+ %t0 = fcmp olt <4 x float> %in, %in
+ %t1 = sext <4 x i1> %t0 to <4 x i32>
+ ret <4 x i32> %t1
}
diff --git a/test/CodeGen/X86/sext-trunc.ll b/test/CodeGen/X86/sext-trunc.ll
index 22b3791ba578..5c59bc00860e 100644
--- a/test/CodeGen/X86/sext-trunc.ll
+++ b/test/CodeGen/X86/sext-trunc.ll
@@ -1,9 +1,10 @@
-; RUN: llc < %s -march=x86 > %t
-; RUN: grep movsbl %t
-; RUN: not grep movz %t
-; RUN: not grep and %t
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s
-define signext i8 @foo(i16 signext %x) nounwind {
+define signext i8 @foo(i16 signext %x) nounwind {
%retval56 = trunc i16 %x to i8
ret i8 %retval56
+
+; CHECK-LABEL: foo:
+; CHECK: movb
+; CHECK-NEXT: retl
}
diff --git a/test/CodeGen/X86/shift-pcmp.ll b/test/CodeGen/X86/shift-pcmp.ll
index 365c7310559b..4945d6115dbe 100644
--- a/test/CodeGen/X86/shift-pcmp.ll
+++ b/test/CodeGen/X86/shift-pcmp.ll
@@ -1,18 +1,20 @@
-; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+sse2 | FileCheck %s
-; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -o - -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -o - -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
define <8 x i16> @foo(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: .short 32
-; CHECK-NEXT: .short 32
-; CHECK-NEXT: .short 32
-; CHECK-NEXT: .short 32
-; CHECK-NEXT: .short 32
-; CHECK-NEXT: .short 32
-; CHECK-NEXT: .short 32
-; CHECK-NEXT: .short 32
-; CHECK-LABEL: {{^_?foo:}}
-; CHECK-NOT: psll
-entry:
+; SSE-LABEL: foo:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: foo:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%icmp = icmp eq <8 x i16> %a, %b
%zext = zext <8 x i1> %icmp to <8 x i16>
%shl = shl nuw nsw <8 x i16> %zext, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
@@ -21,10 +23,23 @@ entry:
; Don't fail with an assert due to an undef in the buildvector
define <8 x i16> @bar(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: bar
-entry:
+; SSE-LABEL: bar:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE-NEXT: psrlw $15, %xmm0
+; SSE-NEXT: psllw $5, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: bar:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0
+; AVX-NEXT: vpsllw $5, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
%icmp = icmp eq <8 x i16> %a, %b
%zext = zext <8 x i1> %icmp to <8 x i16>
%shl = shl nuw nsw <8 x i16> %zext, <i16 5, i16 undef, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
ret <8 x i16> %shl
}
+
diff --git a/test/CodeGen/X86/shrink-wrap-chkstk.ll b/test/CodeGen/X86/shrink-wrap-chkstk.ll
index aecae89aee56..099ef137d8d9 100644
--- a/test/CodeGen/X86/shrink-wrap-chkstk.ll
+++ b/test/CodeGen/X86/shrink-wrap-chkstk.ll
@@ -7,7 +7,7 @@
target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
target triple = "i686-pc-windows-msvc18.0.0"
-%struct.S = type { [12 x i8] }
+%struct.S = type { [8192 x i8] }
define x86_thiscallcc void @call_inalloca(i1 %x) {
entry:
@@ -29,7 +29,7 @@ bb2:
; CHECK-LABEL: _call_inalloca: # @call_inalloca
; CHECK: pushl %ebp
; CHECK: movl %esp, %ebp
-; CHECK: movl $12, %eax
+; CHECK: movl $8192, %eax
; CHECK: calll __chkstk
; CHECK: calll _inalloca_params
; CHECK: movl %ebp, %esp
@@ -64,9 +64,9 @@ false:
; CHECK: cmpl %edx, %eax
; CHECK: jge LBB1_2
; CHECK: pushl %eax
-; CHECK: movl $4100, %eax
+; CHECK: movl $4092, %eax
; CHECK: calll __chkstk
-; CHECK: movl 4100(%esp), %eax
+; CHECK: movl 4092(%esp), %eax
; CHECK: calll _doSomething
; CHECK: LBB1_2:
; CHECK: retl
diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll
new file mode 100644
index 000000000000..58b4e986f774
--- /dev/null
+++ b/test/CodeGen/X86/shrink_vmul.ll
@@ -0,0 +1,865 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s
+
+@c = external global i32*, align 8
+
+; %val1 = load <2 x i8>
+; %op1 = zext<2 x i32> %val1
+; %val2 = load <2 x i8>
+; %op2 = zext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_2xi8:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx
+; CHECK-NEXT: movd %ecx, %xmm1
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+ %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
+ %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
+ %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <4 x i8>
+; %op1 = zext<4 x i32> %val1
+; %val2 = load <4 x i8>
+; %op2 = zext<4 x i32> %val2
+; %rst = mul <4 x i32> %op1, %op2
+;
+define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_4xi8:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT: movdqu %xmm1, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <4 x i8>*
+ %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
+ %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <4 x i8>*
+ %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
+ %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
+ %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
+ store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <8 x i8>
+; %op1 = zext<8 x i32> %val1
+; %val2 = load <8 x i8>
+; %op2 = zext<8 x i32> %val2
+; %rst = mul <8 x i32> %op1, %op2
+;
+define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_8xi8:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <8 x i8>*
+ %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
+ %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <8 x i8>*
+ %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
+ %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
+ %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
+ store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <16 x i8>
+; %op1 = zext<16 x i32> %val1
+; %val2 = load <16 x i8>
+; %op2 = zext<16 x i32> %val2
+; %rst = mul <16 x i32> %op1, %op2
+;
+define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_16xi8:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
+; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm1
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: movdqa %xmm0, %xmm3
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; CHECK-NEXT: movdqa %xmm1, %xmm4
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; CHECK-NEXT: pmullw %xmm3, %xmm4
+; CHECK-NEXT: movdqa %xmm4, %xmm3
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; CHECK-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; CHECK-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm0, 32(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm4, 16(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm3, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
+ %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
+ %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <16 x i8>*
+ %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
+ %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
+ %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
+ store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <2 x i16>
+; %op1 = zext<2 x i32> %val1
+; %val2 = load <2 x i16>
+; %op2 = zext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_2xi16:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pmulhuw %xmm0, %xmm2
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+ %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+ %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
+ %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
+ %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <4 x i16>
+; %op1 = zext<4 x i32> %val1
+; %val2 = load <4 x i16>
+; %op2 = zext<4 x i32> %val2
+; %rst = mul <4 x i32> %op1, %op2
+;
+define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_4xi16:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pmulhuw %xmm0, %xmm2
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT: movdqu %xmm1, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <4 x i16>*
+ %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1
+ %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <4 x i16>*
+ %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1
+ %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
+ %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
+ store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <8 x i16>
+; %op1 = zext<8 x i32> %val1
+; %val2 = load <8 x i16>
+; %op2 = zext<8 x i32> %val2
+; %rst = mul <8 x i32> %op1, %op2
+;
+define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_8xi16:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
+; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pmulhuw %xmm0, %xmm2
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <8 x i16>*
+ %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1
+ %tmp8 = zext <8 x i16> %wide.load to <8 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <8 x i16>*
+ %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1
+ %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
+ %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
+ store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <16 x i16>
+; %op1 = zext<16 x i32> %val1
+; %val2 = load <16 x i16>
+; %op2 = zext<16 x i32> %val2
+; %rst = mul <16 x i32> %op1, %op2
+;
+define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_16xi16:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
+; CHECK-NEXT: movdqu 16(%rdi,%rdx), %xmm1
+; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm2
+; CHECK-NEXT: movdqu 16(%rsi,%rdx), %xmm3
+; CHECK-NEXT: movdqa %xmm2, %xmm4
+; CHECK-NEXT: pmulhuw %xmm0, %xmm4
+; CHECK-NEXT: pmullw %xmm0, %xmm2
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; CHECK-NEXT: movdqa %xmm3, %xmm4
+; CHECK-NEXT: pmulhuw %xmm1, %xmm4
+; CHECK-NEXT: pmullw %xmm1, %xmm3
+; CHECK-NEXT: movdqa %xmm3, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; CHECK-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
+ %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
+ %tmp8 = zext <16 x i16> %wide.load to <16 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
+ %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
+ %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
+ %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
+ store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <2 x i8>
+; %op1 = sext<2 x i32> %val1
+; %val2 = load <2 x i8>
+; %op2 = sext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_2xi8_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx
+; CHECK-NEXT: movd %ecx, %xmm1
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm0
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm1
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: psrad $16, %xmm0
+; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+ %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
+ %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
+ %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <2 x i8>
+; %op1 = sext<2 x i32> %val1
+; %val2 = load <2 x i8>
+; %op2 = zext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_2xi8_sext_zext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx
+; CHECK-NEXT: movd %ecx, %xmm1
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm0
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pmulhw %xmm0, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+ %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
+ %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
+ %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <2 x i16>
+; %op1 = sext<2 x i32> %val1
+; %val2 = load <2 x i16>
+; %op2 = sext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_2xi16_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pmulhw %xmm0, %xmm2
+; CHECK-NEXT: pmullw %xmm0, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+ %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+ %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
+ %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
+ %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <2 x i16>
+; %op1 = sext<2 x i32> %val1
+; %val2 = load <2 x i16>
+; %op2 = zext<2 x i32> %val2
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_2xi16_sext_zext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: psrad $16, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pmuludq %xmm0, %xmm2
+; CHECK-NEXT: movdqa %xmm0, %xmm3
+; CHECK-NEXT: psrlq $32, %xmm3
+; CHECK-NEXT: pmuludq %xmm1, %xmm3
+; CHECK-NEXT: psllq $32, %xmm3
+; CHECK-NEXT: psrlq $32, %xmm1
+; CHECK-NEXT: pmuludq %xmm0, %xmm1
+; CHECK-NEXT: psllq $32, %xmm1
+; CHECK-NEXT: paddq %xmm3, %xmm1
+; CHECK-NEXT: paddq %xmm2, %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+ %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+ %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
+ %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
+ %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val1 = load <16 x i16>
+; %op1 = sext<16 x i32> %val1
+; %val2 = load <16 x i16>
+; %op2 = sext<16 x i32> %val2
+; %rst = mul <16 x i32> %op1, %op2
+;
+define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+; CHECK-LABEL: mul_16xi16_sext:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
+; CHECK-NEXT: movdqu 16(%rdi,%rdx), %xmm1
+; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm2
+; CHECK-NEXT: movdqu 16(%rsi,%rdx), %xmm3
+; CHECK-NEXT: movdqa %xmm2, %xmm4
+; CHECK-NEXT: pmulhw %xmm0, %xmm4
+; CHECK-NEXT: pmullw %xmm0, %xmm2
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; CHECK-NEXT: movdqa %xmm3, %xmm4
+; CHECK-NEXT: pmulhw %xmm1, %xmm4
+; CHECK-NEXT: pmullw %xmm1, %xmm3
+; CHECK-NEXT: movdqa %xmm3, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; CHECK-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
+; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
+ %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
+ %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
+ %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+ %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
+ %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
+ %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
+ %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
+ store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: pmullw {{.*}}(%rip), %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+ %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm0
+; CHECK-NEXT: pmullw {{.*}}(%rip), %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: psrad $16, %xmm0
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+ %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+ %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst4:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+ %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst5:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+ %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i8>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi8_varconst6:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-NEXT: psraw $8, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+ %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+ %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i16>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi16_varconst1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhuw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+ %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+ %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i16>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi16_varconst2:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmulhw %xmm1, %xmm2
+; CHECK-NEXT: pmullw %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+ %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+ %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i16>
+; %op1 = zext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi16_varconst3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; CHECK-NEXT: movl $65536, %ecx # imm = 0x10000
+; CHECK-NEXT: movd %rcx, %xmm1
+; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-NEXT: psrlq $32, %xmm0
+; CHECK-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-NEXT: psllq $32, %xmm0
+; CHECK-NEXT: paddq %xmm2, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+ %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+ %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
+
+; %val = load <2 x i16>
+; %op1 = sext<2 x i32> %val
+; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768)
+; %rst = mul <2 x i32> %op1, %op2
+;
+define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
+; CHECK-LABEL: mul_2xi16_varconst4:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: psrad $16, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000
+; CHECK-NEXT: movd %rcx, %xmm1
+; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-NEXT: psrlq $32, %xmm0
+; CHECK-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-NEXT: psllq $32, %xmm0
+; CHECK-NEXT: paddq %xmm2, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
+; CHECK-NEXT: retq
+entry:
+ %pre = load i32*, i32** @c
+ %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+ %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
+ %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
+ %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
+ %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
+ %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+ %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+ store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/sibcall-5.ll b/test/CodeGen/X86/sibcall-5.ll
index aab028bd17c8..4901b4fa069c 100644
--- a/test/CodeGen/X86/sibcall-5.ll
+++ b/test/CodeGen/X86/sibcall-5.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin8 -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=i386-apple-darwin9 -mattr=+sse2 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-sse3 | FileCheck %s --check-prefix=X64_BAD
@@ -8,7 +8,7 @@
define double @foo(double %a) nounwind readonly ssp {
entry:
; X32-LABEL: foo:
-; X32: jmp L_sin$stub
+; X32: jmp _sin
; X64-LABEL: foo:
; X64: jmp _sin
@@ -18,7 +18,7 @@ entry:
define float @bar(float %a) nounwind readonly ssp {
; X32-LABEL: bar:
-; X32: jmp L_sinf$stub
+; X32: jmp _sinf
; X64-LABEL: bar:
; X64: jmp _sinf
@@ -27,10 +27,6 @@ entry:
ret float %0
}
-; X32-LABEL: L_sin$stub:
-; X32-NEXT: .indirect_symbol _sin
-; X32-LABEL: L_sinf$stub:
-; X32-NEXT: .indirect_symbol _sinf
declare float @sinf(float) nounwind readonly
diff --git a/test/CodeGen/X86/sibcall-byval.ll b/test/CodeGen/X86/sibcall-byval.ll
index c335f30a93a2..8f5833adf5a3 100644
--- a/test/CodeGen/X86/sibcall-byval.ll
+++ b/test/CodeGen/X86/sibcall-byval.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=32
+; RUN: llc < %s -mtriple=i386-apple-darwin9 | FileCheck %s -check-prefix=32
; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -check-prefix=64
%struct.p = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
@@ -6,7 +6,7 @@
define i32 @f(%struct.p* byval align 4 %q) nounwind ssp {
entry:
; 32: _f:
-; 32: jmp L_g$stub
+; 32: jmp _g
; 64: _f:
; 64: jmp _g
@@ -19,7 +19,7 @@ declare i32 @g(%struct.p* byval align 4)
define i32 @h(%struct.p* byval align 4 %q, i32 %r) nounwind ssp {
entry:
; 32: _h:
-; 32: jmp L_i$stub
+; 32: jmp _i
; 64: _h:
; 64: jmp _i
diff --git a/test/CodeGen/X86/sincos-opt.ll b/test/CodeGen/X86/sincos-opt.ll
index 9d02bcd9a6c7..f0dff3b806c5 100644
--- a/test/CodeGen/X86/sincos-opt.ll
+++ b/test/CodeGen/X86/sincos-opt.ll
@@ -1,6 +1,8 @@
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 -mcpu=core2 | FileCheck %s --check-prefix=OSX_SINCOS
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core2 | FileCheck %s --check-prefix=OSX_NOOPT
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 | FileCheck %s --check-prefix=GNU_NOOPT
; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNU_SINCOS
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNUX32_SINCOS
; Combine sin / cos into a single call.
; rdar://13087969
@@ -13,6 +15,15 @@ entry:
; GNU_SINCOS: movss 4(%rsp), %xmm0
; GNU_SINCOS: addss (%rsp), %xmm0
+; GNUX32_SINCOS-LABEL: test1:
+; GNUX32_SINCOS: callq sincosf
+; GNUX32_SINCOS: movss 4(%esp), %xmm0
+; GNUX32_SINCOS: addss (%esp), %xmm0
+
+; GNU_NOOPT: test1
+; GNU_NOOPT: callq sinf
+; GNU_NOOPT: callq cosf
+
; OSX_SINCOS-LABEL: test1:
; OSX_SINCOS: callq ___sincosf_stret
; OSX_SINCOS: movshdup {{.*}} xmm1 = xmm0[1,1,3,3]
@@ -34,6 +45,15 @@ entry:
; GNU_SINCOS: movsd 16(%rsp), %xmm0
; GNU_SINCOS: addsd 8(%rsp), %xmm0
+; GNUX32_SINCOS-LABEL: test2:
+; GNUX32_SINCOS: callq sincos
+; GNUX32_SINCOS: movsd 16(%esp), %xmm0
+; GNUX32_SINCOS: addsd 8(%esp), %xmm0
+
+; GNU_NOOPT: test2:
+; GNU_NOOPT: callq sin
+; GNU_NOOPT: callq cos
+
; OSX_SINCOS-LABEL: test2:
; OSX_SINCOS: callq ___sincos_stret
; OSX_SINCOS: addsd %xmm1, %xmm0
@@ -53,6 +73,16 @@ entry:
; GNU_SINCOS: callq sinl
; GNU_SINCOS: callq cosl
; GNU_SINCOS: ret
+
+; GNUX32_SINCOS-LABEL: test3:
+; GNUX32_SINCOS: callq sinl
+; GNUX32_SINCOS: callq cosl
+; GNUX32_SINCOS: ret
+
+; GNU_NOOPT: test3:
+; GNU_NOOPT: callq sinl
+; GNU_NOOPT: callq cosl
+
%call = tail call x86_fp80 @sinl(x86_fp80 %x) nounwind
%call1 = tail call x86_fp80 @cosl(x86_fp80 %x) nounwind
%add = fadd x86_fp80 %call, %call1
diff --git a/test/CodeGen/X86/sink-blockfreq.ll b/test/CodeGen/X86/sink-blockfreq.ll
index c2f0411901a7..5436cf248bd5 100644
--- a/test/CodeGen/X86/sink-blockfreq.ll
+++ b/test/CodeGen/X86/sink-blockfreq.ll
@@ -1,5 +1,5 @@
-; RUN: llc -disable-machine-licm -machine-sink-bfi=true -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_BFI
-; RUN: llc -disable-machine-licm -machine-sink-bfi=false -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_NOBFI
+; RUN: llc -disable-preheader-prot=true -disable-machine-licm -machine-sink-bfi=true -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_BFI
+; RUN: llc -disable-preheader-prot=true -disable-machine-licm -machine-sink-bfi=false -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_NOBFI
; Test that by changing BlockFrequencyInfo we change the order in which
; machine-sink looks for successor blocks. By not using BFI, both G and B
diff --git a/test/CodeGen/X86/sink-cheap-instructions.ll b/test/CodeGen/X86/sink-cheap-instructions.ll
index 9b9a6865af93..8966ca50142e 100644
--- a/test/CodeGen/X86/sink-cheap-instructions.ll
+++ b/test/CodeGen/X86/sink-cheap-instructions.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-linux -sink-insts-to-avoid-spills | FileCheck %s -check-prefix=SINK
; Ensure that we sink copy-like instructions into loops to avoid register
diff --git a/test/CodeGen/X86/sjlj-eh.ll b/test/CodeGen/X86/sjlj-eh.ll
new file mode 100644
index 000000000000..4d2e4e821f42
--- /dev/null
+++ b/test/CodeGen/X86/sjlj-eh.ll
@@ -0,0 +1,72 @@
+; RUN: llc -mtriple i386-windows-gnu -exception-model sjlj -filetype asm -o - %s | FileCheck %s
+
+declare void @_Z20function_that_throwsv()
+declare i32 @__gxx_personality_sj0(...)
+declare i8* @__cxa_begin_catch(i8*)
+declare void @__cxa_end_catch()
+
+define void @_Z8functionv() personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*) {
+entry:
+ invoke void @_Z20function_that_throwsv()
+ to label %try.cont unwind label %lpad
+
+lpad:
+ %0 = landingpad { i8*, i32 }
+ catch i8* null
+ %1 = extractvalue { i8*, i32 } %0, 0
+ %2 = tail call i8* @__cxa_begin_catch(i8* %1)
+ tail call void @__cxa_end_catch()
+ br label %try.cont
+
+try.cont:
+ ret void
+}
+
+; struct _Unwind_FunctionContext {
+; +00 struct _Unwind_FunctionContext *prev; -64(%ebp)
+; +04 uintptr_t __callsite; -60(%ebp)
+; +08 uintptr_t __buffer[4]; -44(%ebp)
+; +28 __personality_routine __personality; -40(%ebp)
+; +32 uintptr_t __lsda; -36(%ebp)
+; +36 void *__jbuf[]; -32(%ebp)
+; };
+
+
+; CHECK-LABEL: __Z8functionv:
+; struct _Unwind_FunctionContext UFC;
+;
+; UFC.__personality = __gxx_personality_sj0
+; CHECK: movl $___gxx_personality_sj0, -40(%ebp)
+; UFC.__lsda = $LSDA
+; CHECK: movl $[[LSDA:GCC_except_table[0-9]+]], -36(%ebp)
+; UFC.__jbuf[0] = $EBP
+; CHECK: movl %ebp, -32(%ebp)
+; UFC.__jbuf[2] = $ESP
+; CHECK: movl %esp, -24(%ebp)
+; UFC.__jbuf[1] = $EIP
+; CHECK: movl $[[RESUME:LBB[0-9]+_[0-9]+]], -28(%ebp)
+; UFC.__callsite = 1
+; CHECK: movl $1, -60(%ebp)
+; _Unwind_SjLj_Register(&UFC);
+; CHECK: leal -64(%ebp), %eax
+; CHECK: pushl %eax
+; CHECK: calll __Unwind_SjLj_Register
+; CHECK: addl $4, %esp
+; function_that_throws();
+; CHECK: calll __Z20function_that_throwsv
+; _Unwind_SjLj_Unregister(&UFC);
+; CHECK: leal -64(%ebp), %eax
+; CHECK: calll __Unwind_SjLj_Unregister
+;
+; CHECK: [[RESUME]]:
+; CHECK: leal -64(%ebp), %esi
+; assert(UFC.__callsite <= 1);
+; CHECK: movl -60(%ebp), %eax
+; CHECK: cmpl $1, %eax
+; CHECK: jbe [[CONT:LBB[0-9]+_[0-9]+]]
+; CHECK: ud2
+; CHECK: [[CONT]]:
+; *Handlers[--UFC.__callsite]
+; CHECK: subl $1, %eax
+; CHECK: jmpl *LJTI
+
diff --git a/test/CodeGen/X86/slow-unaligned-mem.ll b/test/CodeGen/X86/slow-unaligned-mem.ll
index 27cbef681b7e..41e9a95bcdd8 100644
--- a/test/CodeGen/X86/slow-unaligned-mem.ll
+++ b/test/CodeGen/X86/slow-unaligned-mem.ll
@@ -14,15 +14,15 @@
; Intel chips with fast unaligned memory accesses
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont 2>&1 | FileCheck %s --check-prefix=FAST
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=nehalem 2>&1 | FileCheck %s --check-prefix=FAST
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=westmere 2>&1 | FileCheck %s --check-prefix=FAST
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=sandybridge 2>&1 | FileCheck %s --check-prefix=FAST
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=ivybridge 2>&1 | FileCheck %s --check-prefix=FAST
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=haswell 2>&1 | FileCheck %s --check-prefix=FAST
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=broadwell 2>&1 | FileCheck %s --check-prefix=FAST
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=knl 2>&1 | FileCheck %s --check-prefix=FAST
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=skylake 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=nehalem 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=westmere 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=sandybridge 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=ivybridge 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=haswell 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=broadwell 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=knl 2>&1 | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=skylake-avx512 2>&1 | FileCheck %s --check-prefix=FAST
; AMD chips with slow unaligned memory accesses
diff --git a/test/CodeGen/X86/sqrt-fastmath-mir.ll b/test/CodeGen/X86/sqrt-fastmath-mir.ll
new file mode 100644
index 000000000000..750b4d96e5d0
--- /dev/null
+++ b/test/CodeGen/X86/sqrt-fastmath-mir.ll
@@ -0,0 +1,52 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -recip=sqrt:2 -stop-after=expand-isel-pseudos 2>&1 | FileCheck %s
+
+declare float @llvm.sqrt.f32(float) #0
+
+define float @foo(float %f) #0 {
+; CHECK: {{name: *foo}}
+; CHECK: body:
+; CHECK: %0 = COPY %xmm0
+; CHECK: %1 = VRSQRTSSr killed %2, %0
+; CHECK: %3 = VMULSSrr %0, %1
+; CHECK: %4 = VMOVSSrm
+; CHECK: %5 = VFMADDSSr213r %1, killed %3, %4
+; CHECK: %6 = VMOVSSrm
+; CHECK: %7 = VMULSSrr %1, %6
+; CHECK: %8 = VMULSSrr killed %7, killed %5
+; CHECK: %9 = VMULSSrr %0, %8
+; CHECK: %10 = VFMADDSSr213r %8, %9, %4
+; CHECK: %11 = VMULSSrr %9, %6
+; CHECK: %12 = VMULSSrr killed %11, killed %10
+; CHECK: %13 = FsFLD0SS
+; CHECK: %14 = VCMPSSrr %0, killed %13, 0
+; CHECK: %15 = VFsANDNPSrr killed %14, killed %12
+; CHECK: %xmm0 = COPY %15
+; CHECK: RET 0, %xmm0
+ %call = tail call float @llvm.sqrt.f32(float %f) #1
+ ret float %call
+}
+
+define float @rfoo(float %f) #0 {
+; CHECK: {{name: *rfoo}}
+; CHECK: body: |
+; CHECK: %0 = COPY %xmm0
+; CHECK: %1 = VRSQRTSSr killed %2, %0
+; CHECK: %3 = VMULSSrr %0, %1
+; CHECK: %4 = VMOVSSrm
+; CHECK: %5 = VFMADDSSr213r %1, killed %3, %4
+; CHECK: %6 = VMOVSSrm
+; CHECK: %7 = VMULSSrr %1, %6
+; CHECK: %8 = VMULSSrr killed %7, killed %5
+; CHECK: %9 = VMULSSrr %0, %8
+; CHECK: %10 = VFMADDSSr213r %8, killed %9, %4
+; CHECK: %11 = VMULSSrr %8, %6
+; CHECK: %12 = VMULSSrr killed %11, killed %10
+; CHECK: %xmm0 = COPY %12
+; CHECK: RET 0, %xmm0
+ %sqrt = tail call float @llvm.sqrt.f32(float %f)
+ %div = fdiv fast float 1.0, %sqrt
+ ret float %div
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll
index 386409a674ef..1c6b13026a72 100644
--- a/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/test/CodeGen/X86/sqrt-fastmath.ll
@@ -34,12 +34,11 @@ define float @ff(float %f) #0 {
; ESTIMATE-LABEL: ff:
; ESTIMATE: # BB#0:
; ESTIMATE-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
-; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm2
-; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm3
-; ESTIMATE-NEXT: vmulss %xmm3, %xmm1, %xmm1
+; ESTIMATE-NEXT: vmulss %xmm1, %xmm0, %xmm2
+; ESTIMATE-NEXT: vmulss %xmm1, %xmm2, %xmm1
; ESTIMATE-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1
-; ESTIMATE-NEXT: vmulss %xmm0, %xmm2, %xmm2
-; ESTIMATE-NEXT: vmulss %xmm2, %xmm1, %xmm1
+; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2
+; ESTIMATE-NEXT: vmulss %xmm1, %xmm2, %xmm1
; ESTIMATE-NEXT: vxorps %xmm2, %xmm2, %xmm2
; ESTIMATE-NEXT: vcmpeqss %xmm2, %xmm0, %xmm0
; ESTIMATE-NEXT: vandnps %xmm1, %xmm0, %xmm0
@@ -78,11 +77,11 @@ define float @reciprocal_square_root(float %x) #0 {
; ESTIMATE-LABEL: reciprocal_square_root:
; ESTIMATE: # BB#0:
; ESTIMATE-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
-; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm2
-; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm0
-; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm0
-; ESTIMATE-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
+; ESTIMATE-NEXT: vmulss %xmm1, %xmm1, %xmm2
; ESTIMATE-NEXT: vmulss %xmm2, %xmm0, %xmm0
+; ESTIMATE-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
+; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm1
+; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm0
; ESTIMATE-NEXT: retq
%sqrt = tail call float @llvm.sqrt.f32(float %x)
%div = fdiv fast float 1.0, %sqrt
@@ -100,11 +99,11 @@ define <4 x float> @reciprocal_square_root_v4f32(<4 x float> %x) #0 {
; ESTIMATE-LABEL: reciprocal_square_root_v4f32:
; ESTIMATE: # BB#0:
; ESTIMATE-NEXT: vrsqrtps %xmm0, %xmm1
-; ESTIMATE-NEXT: vmulps %xmm0, %xmm1, %xmm0
-; ESTIMATE-NEXT: vmulps %xmm0, %xmm1, %xmm0
+; ESTIMATE-NEXT: vmulps %xmm1, %xmm1, %xmm2
+; ESTIMATE-NEXT: vmulps %xmm2, %xmm0, %xmm0
; ESTIMATE-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
; ESTIMATE-NEXT: vmulps {{.*}}(%rip), %xmm1, %xmm1
-; ESTIMATE-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; ESTIMATE-NEXT: vmulps %xmm0, %xmm1, %xmm0
; ESTIMATE-NEXT: retq
%sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
@@ -125,11 +124,11 @@ define <8 x float> @reciprocal_square_root_v8f32(<8 x float> %x) #0 {
; ESTIMATE-LABEL: reciprocal_square_root_v8f32:
; ESTIMATE: # BB#0:
; ESTIMATE-NEXT: vrsqrtps %ymm0, %ymm1
-; ESTIMATE-NEXT: vmulps %ymm0, %ymm1, %ymm0
-; ESTIMATE-NEXT: vmulps %ymm0, %ymm1, %ymm0
+; ESTIMATE-NEXT: vmulps %ymm1, %ymm1, %ymm2
+; ESTIMATE-NEXT: vmulps %ymm2, %ymm0, %ymm0
; ESTIMATE-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
; ESTIMATE-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
-; ESTIMATE-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; ESTIMATE-NEXT: vmulps %ymm0, %ymm1, %ymm0
; ESTIMATE-NEXT: retq
%sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
diff --git a/test/CodeGen/X86/sse-intel-ocl.ll b/test/CodeGen/X86/sse-intel-ocl.ll
index b96ecc575021..1d5a88a1a5ec 100644
--- a/test/CodeGen/X86/sse-intel-ocl.ll
+++ b/test/CodeGen/X86/sse-intel-ocl.ll
@@ -14,7 +14,7 @@ declare <16 x float> @func_float16(<16 x float>, <16 x float>)
; WIN64: ret
; WIN32: testf16_inp
-; WIN32: movl %eax, (%esp)
+; WIN32: pushl %eax
; WIN32: addps {{.*}}, {{%xmm[0-3]}}
; WIN32: addps {{.*}}, {{%xmm[0-3]}}
; WIN32: addps {{.*}}, {{%xmm[0-3]}}
diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
new file mode 100644
index 000000000000..2102b4211153
--- /dev/null
+++ b/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c
+
+define <4 x float> @test_mm_cvtsi64_ss(<4 x float> %a0, i64 %a1) nounwind {
+; X64-LABEL: test_mm_cvtsi64_ss:
+; X64: # BB#0:
+; X64-NEXT: cvtsi2ssq %rdi, %xmm1
+; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: retq
+ %cvt = sitofp i64 %a1 to float
+ %res = insertelement <4 x float> %a0, float %cvt, i32 0
+ ret <4 x float> %res
+}
+
+define i64 @test_mm_cvtss_si64(<4 x float> %a0) nounwind {
+; X64-LABEL: test_mm_cvtss_si64:
+; X64: # BB#0:
+; X64-NEXT: cvtss2si %xmm0, %rax
+; X64-NEXT: retq
+ %res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
+ ret i64 %res
+}
+declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
+
+define i64 @test_mm_cvttss_si64(<4 x float> %a0) nounwind {
+; X64-LABEL: test_mm_cvttss_si64:
+; X64: # BB#0:
+; X64-NEXT: cvttss2si %xmm0, %rax
+; X64-NEXT: retq
+ %cvt = extractelement <4 x float> %a0, i32 0
+ %res = fptosi float %cvt to i64
+ ret i64 %res
+}
diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..090ddfdfa93a
--- /dev/null
+++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -0,0 +1,2303 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,-sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c
+
+define <4 x float> @test_mm_add_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_add_ps:
+; X32: # BB#0:
+; X32-NEXT: addps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_add_ps:
+; X64: # BB#0:
+; X64-NEXT: addps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = fadd <4 x float> %a0, %a1
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_add_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_add_ss:
+; X32: # BB#0:
+; X32-NEXT: addss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_add_ss:
+; X64: # BB#0:
+; X64-NEXT: addss %xmm1, %xmm0
+; X64-NEXT: retq
+ %ext0 = extractelement <4 x float> %a0, i32 0
+ %ext1 = extractelement <4 x float> %a1, i32 0
+ %fadd = fadd float %ext0, %ext1
+ %res = insertelement <4 x float> %a0, float %fadd, i32 0
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_and_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_and_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: pushl %esi
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $64, %esp
+; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT: andl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: leal -4(%ebp), %esp
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_and_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8
+; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: andl %eax, %edx
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: movq %rcx, %rdi
+; X64-NEXT: andl %r8d, %ecx
+; X64-NEXT: shrq $32, %r8
+; X64-NEXT: shrq $32, %rsi
+; X64-NEXT: shrq $32, %rdi
+; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: andl %r8d, %edi
+; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: andl %eax, %esi
+; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x float> %a0 to <4 x i32>
+ %arg1 = bitcast <4 x float> %a1 to <4 x i32>
+ %res = and <4 x i32> %arg0, %arg1
+ %bc = bitcast <4 x i32> %res to <4 x float>
+ ret <4 x float> %bc
+}
+
+define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_andnot_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: pushl %esi
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $64, %esp
+; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT: notl %edx
+; X32-NEXT: notl %ecx
+; X32-NEXT: notl %esi
+; X32-NEXT: notl %eax
+; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, (%esp)
+; X32-NEXT: andl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT: andl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: leal -4(%ebp), %esp
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_andnot_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: movq %rcx, %rdx
+; X64-NEXT: shrq $32, %rdx
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: shrq $32, %rsi
+; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8
+; X64-NEXT: notl %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: shrq $32, %rdi
+; X64-NEXT: notl %ecx
+; X64-NEXT: andl %r8d, %ecx
+; X64-NEXT: shrq $32, %r8
+; X64-NEXT: notl %esi
+; X64-NEXT: notl %edx
+; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT: andl %r8d, %edx
+; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: andl %edi, %esi
+; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x float> %a0 to <4 x i32>
+ %arg1 = bitcast <4 x float> %a1 to <4 x i32>
+ %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %res = and <4 x i32> %not, %arg1
+ %bc = bitcast <4 x i32> %res to <4 x float>
+ ret <4 x float> %bc
+}
+
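+; Packed comparisons are expressed as fcmp + sext to <4 x i32> + bitcast, which lowers
+; to the corresponding cmp*ps instruction producing an all-ones/all-zeros lane mask.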
+define <4 x float> @test_mm_cmpeq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpeqps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpeq_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpeqps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp oeq <4 x float> %a0, %a1
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpeq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpeqss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpeq_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpeqss %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_mm_cmpge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpge_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpleps %xmm0, %xmm1
+; X32-NEXT: movaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpge_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpleps %xmm0, %xmm1
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp ole <4 x float> %a1, %a0
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
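+; cmpss has no GE/GT predicates, so the scalar *ge/*gt tests swap the operands, use the
+; LE/LT (or NLE/NLT) predicate, and blend the low lane back into %a0 with movss.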
+define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpge_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpless %xmm0, %xmm1
+; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpge_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpless %xmm0, %xmm1
+; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: retq
+ %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2)
+ %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpgt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpltps %xmm0, %xmm1
+; X32-NEXT: movaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpgt_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpltps %xmm0, %xmm1
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp olt <4 x float> %a1, %a0
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpltss %xmm0, %xmm1
+; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpgt_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpltss %xmm0, %xmm1
+; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: retq
+ %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1)
+ %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmple_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmple_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpleps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmple_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpleps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp ole <4 x float> %a0, %a1
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmple_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmple_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpless %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmple_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpless %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 2)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmplt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpltps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmplt_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpltps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp olt <4 x float> %a0, %a1
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmplt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpltss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmplt_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpltss %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 1)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpneq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpneq_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpneqps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpneq_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpneqps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp une <4 x float> %a0, %a1
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpneq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpneq_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpneqss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpneq_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpneqss %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpnge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnge_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpnleps %xmm0, %xmm1
+; X32-NEXT: movaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnge_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpnleps %xmm0, %xmm1
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp ugt <4 x float> %a1, %a0
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnge_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpnless %xmm0, %xmm1
+; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnge_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpnless %xmm0, %xmm1
+; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: retq
+ %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6)
+ %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpngt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpngt_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpnltps %xmm0, %xmm1
+; X32-NEXT: movaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpngt_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpnltps %xmm0, %xmm1
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp uge <4 x float> %a1, %a0
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpngt_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpnltss %xmm0, %xmm1
+; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpngt_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpnltss %xmm0, %xmm1
+; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: retq
+ %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5)
+ %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpnle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnle_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpnleps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnle_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpnleps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp ugt <4 x float> %a0, %a1
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpnle_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnle_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpnless %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnle_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpnless %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 6)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpnlt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnlt_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpnltps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnlt_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpnltps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp uge <4 x float> %a0, %a1
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpnlt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnlt_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpnltss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnlt_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpnltss %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 5)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpord_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpordps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpord_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpordps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp ord <4 x float> %a0, %a1
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpord_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpordss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpord_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpordss %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpunord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpunord_ps:
+; X32: # BB#0:
+; X32-NEXT: cmpunordps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpunord_ps:
+; X64: # BB#0:
+; X64-NEXT: cmpunordps %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = fcmp uno <4 x float> %a0, %a1
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_cmpunord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cmpunord_ss:
+; X32: # BB#0:
+; X32-NEXT: cmpunordss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpunord_ss:
+; X64: # BB#0:
+; X64-NEXT: cmpunordss %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 3)
+ ret <4 x float> %res
+}
+
+define i32 @test_mm_comieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_comieq_ss:
+; X32: # BB#0:
+; X32-NEXT: comiss %xmm1, %xmm0
+; X32-NEXT: setnp %al
+; X32-NEXT: sete %cl
+; X32-NEXT: andb %al, %cl
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comieq_ss:
+; X64: # BB#0:
+; X64-NEXT: comiss %xmm1, %xmm0
+; X64-NEXT: setnp %al
+; X64-NEXT: sete %cl
+; X64-NEXT: andb %al, %cl
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_comige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_comige_ss:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: comiss %xmm1, %xmm0
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comige_ss:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: comiss %xmm1, %xmm0
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_comigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_comigt_ss:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: comiss %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comigt_ss:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: comiss %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_comile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_comile_ss:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: comiss %xmm0, %xmm1
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comile_ss:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: comiss %xmm0, %xmm1
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_comilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_comilt_ss:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: comiss %xmm0, %xmm1
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comilt_ss:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: comiss %xmm0, %xmm1
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_comineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_comineq_ss:
+; X32: # BB#0:
+; X32-NEXT: comiss %xmm1, %xmm0
+; X32-NEXT: setp %al
+; X32-NEXT: setne %cl
+; X32-NEXT: orb %al, %cl
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comineq_ss:
+; X64: # BB#0:
+; X64-NEXT: comiss %xmm1, %xmm0
+; X64-NEXT: setp %al
+; X64-NEXT: setne %cl
+; X64-NEXT: orb %al, %cl
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_cvt_ss2si(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvt_ss2si:
+; X32: # BB#0:
+; X32-NEXT: cvtss2si %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvt_ss2si:
+; X64: # BB#0:
+; X64-NEXT: cvtss2si %xmm0, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
+; X32-LABEL: test_mm_cvtsi32_ss:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: cvtsi2ssl %eax, %xmm1
+; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtsi32_ss:
+; X64: # BB#0:
+; X64-NEXT: cvtsi2ssl %edi, %xmm1
+; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: retq
+ %cvt = sitofp i32 %a1 to float
+ %res = insertelement <4 x float> %a0, float %cvt, i32 0
+ ret <4 x float> %res
+}
+
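+; On x86-64 the low element of %a0 is already the float return value in %xmm0, so no
+; instructions are needed; the 32-bit ABI returns float on the x87 stack, hence the flds.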
+define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvtss_f32:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: movss %xmm0, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtss_f32:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = extractelement <4 x float> %a0, i32 0
+ ret float %res
+}
+
+define i32 @test_mm_cvtss_si32(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvtss_si32:
+; X32: # BB#0:
+; X32-NEXT: cvtss2si %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtss_si32:
+; X64: # BB#0:
+; X64-NEXT: cvtss2si %xmm0, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
+ ret i32 %res
+}
+
+define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvttss_si:
+; X32: # BB#0:
+; X32-NEXT: cvttss2si %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvttss_si:
+; X64: # BB#0:
+; X64-NEXT: cvttss2si %xmm0, %eax
+; X64-NEXT: retq
+ %cvt = extractelement <4 x float> %a0, i32 0
+ %res = fptosi float %cvt to i32
+ ret i32 %res
+}
+
+define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvttss_si32:
+; X32: # BB#0:
+; X32-NEXT: cvttss2si %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvttss_si32:
+; X64: # BB#0:
+; X64-NEXT: cvttss2si %xmm0, %eax
+; X64-NEXT: retq
+ %cvt = extractelement <4 x float> %a0, i32 0
+ %res = fptosi float %cvt to i32
+ ret i32 %res
+}
+
+define <4 x float> @test_mm_div_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_div_ps:
+; X32: # BB#0:
+; X32-NEXT: divps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_div_ps:
+; X64: # BB#0:
+; X64-NEXT: divps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = fdiv <4 x float> %a0, %a1
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_div_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_div_ss:
+; X32: # BB#0:
+; X32-NEXT: divss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_div_ss:
+; X64: # BB#0:
+; X64-NEXT: divss %xmm1, %xmm0
+; X64-NEXT: retq
+ %ext0 = extractelement <4 x float> %a0, i32 0
+ %ext1 = extractelement <4 x float> %a1, i32 0
+ %fdiv = fdiv float %ext0, %ext1
+ %res = insertelement <4 x float> %a0, float %fdiv, i32 0
+ ret <4 x float> %res
+}
+
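+; The _MM_GET_* helpers read MXCSR by storing it to a stack slot with stmxcsr and
+; masking out the field of interest (0x1F80 is the exception-mask bits).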
+define i32 @test_MM_GET_EXCEPTION_MASK() nounwind {
+; X32-LABEL: test_MM_GET_EXCEPTION_MASK:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: leal (%esp), %eax
+; X32-NEXT: stmxcsr (%eax)
+; X32-NEXT: movl (%esp), %eax
+; X32-NEXT: andl $8064, %eax # imm = 0x1F80
+; X32-NEXT: popl %ecx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_MM_GET_EXCEPTION_MASK:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: stmxcsr (%rax)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: andl $8064, %eax # imm = 0x1F80
+; X64-NEXT: retq
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ call void @llvm.x86.sse.stmxcsr(i8* %2)
+ %3 = load i32, i32* %1, align 4
+ %4 = and i32 %3, 8064
+ ret i32 %4
+}
+declare void @llvm.x86.sse.stmxcsr(i8*) nounwind readnone
+
+define i32 @test_MM_GET_EXCEPTION_STATE() nounwind {
+; X32-LABEL: test_MM_GET_EXCEPTION_STATE:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: leal (%esp), %eax
+; X32-NEXT: stmxcsr (%eax)
+; X32-NEXT: movl (%esp), %eax
+; X32-NEXT: andl $63, %eax
+; X32-NEXT: popl %ecx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_MM_GET_EXCEPTION_STATE:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: stmxcsr (%rax)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: andl $63, %eax
+; X64-NEXT: retq
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ call void @llvm.x86.sse.stmxcsr(i8* %2)
+ %3 = load i32, i32* %1, align 4
+ %4 = and i32 %3, 63
+ ret i32 %4
+}
+
+define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind {
+; X32-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: leal (%esp), %eax
+; X32-NEXT: stmxcsr (%eax)
+; X32-NEXT: movl (%esp), %eax
+; X32-NEXT: andl $32768, %eax # imm = 0x8000
+; X32-NEXT: popl %ecx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: stmxcsr (%rax)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: andl $32768, %eax # imm = 0x8000
+; X64-NEXT: retq
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ call void @llvm.x86.sse.stmxcsr(i8* %2)
+ %3 = load i32, i32* %1, align 4
+ %4 = and i32 %3, 32768
+ ret i32 %4
+}
+
+define i32 @test_MM_GET_ROUNDING_MODE() nounwind {
+; X32-LABEL: test_MM_GET_ROUNDING_MODE:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: leal (%esp), %eax
+; X32-NEXT: stmxcsr (%eax)
+; X32-NEXT: movl (%esp), %eax
+; X32-NEXT: andl $24576, %eax # imm = 0x6000
+; X32-NEXT: popl %ecx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_MM_GET_ROUNDING_MODE:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: stmxcsr (%rax)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: andl $24576, %eax # imm = 0x6000
+; X64-NEXT: retq
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ call void @llvm.x86.sse.stmxcsr(i8* %2)
+ %3 = load i32, i32* %1, align 4
+ %4 = and i32 %3, 24576
+ ret i32 %4
+}
+
+define i32 @test_mm_getcsr() nounwind {
+; X32-LABEL: test_mm_getcsr:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: leal (%esp), %eax
+; X32-NEXT: stmxcsr (%eax)
+; X32-NEXT: movl (%esp), %eax
+; X32-NEXT: popl %ecx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_getcsr:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: stmxcsr (%rax)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: retq
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ call void @llvm.x86.sse.stmxcsr(i8* %2)
+ %3 = load i32, i32* %1, align 4
+ ret i32 %3
+}
+
+define <4 x float> @test_mm_load_ps(float* %a0) nounwind {
+; X32-LABEL: test_mm_load_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_load_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps (%rdi), %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ %res = load <4 x float>, <4 x float>* %arg0, align 16
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_load_ps1(float* %a0) nounwind {
+; X32-LABEL: test_mm_load_ps1:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_load_ps1:
+; X64: # BB#0:
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: retq
+ %ld = load float, float* %a0, align 4
+ %res0 = insertelement <4 x float> undef, float %ld, i32 0
+ %res1 = insertelement <4 x float> %res0, float %ld, i32 1
+ %res2 = insertelement <4 x float> %res1, float %ld, i32 2
+ %res3 = insertelement <4 x float> %res2, float %ld, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x float> @test_mm_load_ss(float* %a0) nounwind {
+; X32-LABEL: test_mm_load_ss:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_load_ss:
+; X64: # BB#0:
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: retq
+ %ld = load float, float* %a0, align 1
+ %res0 = insertelement <4 x float> undef, float %ld, i32 0
+ %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
+ %res2 = insertelement <4 x float> %res1, float 0.0, i32 2
+ %res3 = insertelement <4 x float> %res2, float 0.0, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x float> @test_mm_load1_ps(float* %a0) nounwind {
+; X32-LABEL: test_mm_load1_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_load1_ps:
+; X64: # BB#0:
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: retq
+ %ld = load float, float* %a0, align 4
+ %res0 = insertelement <4 x float> undef, float %ld, i32 0
+ %res1 = insertelement <4 x float> %res0, float %ld, i32 1
+ %res2 = insertelement <4 x float> %res1, float %ld, i32 2
+ %res3 = insertelement <4 x float> %res2, float %ld, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
+; X32-LABEL: test_mm_loadh_pi:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadh_pi:
+; X64: # BB#0:
+; X64-NEXT: movq (%rdi), %rax
+; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-NEXT: xorps %xmm2, %xmm2
+; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: retq
+ %ptr = bitcast x86_mmx* %a1 to <2 x float>*
+ %ld = load <2 x float>, <2 x float>* %ptr
+ %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
+; X32-LABEL: test_mm_loadl_pi:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; X32-NEXT: movaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadl_pi:
+; X64: # BB#0:
+; X64-NEXT: movq (%rdi), %rax
+; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-NEXT: xorps %xmm2, %xmm2
+; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %ptr = bitcast x86_mmx* %a1 to <2 x float>*
+ %ld = load <2 x float>, <2 x float>* %ptr
+ %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_loadr_ps(float* %a0) nounwind {
+; X32-LABEL: test_mm_loadr_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps (%eax), %xmm0
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadr_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps (%rdi), %xmm0
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ %ld = load <4 x float>, <4 x float>* %arg0, align 16
+ %res = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_loadu_ps(float* %a0) nounwind {
+; X32-LABEL: test_mm_loadu_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movups (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadu_ps:
+; X64: # BB#0:
+; X64-NEXT: movups (%rdi), %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ %res = load <4 x float>, <4 x float>* %arg0, align 1
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_max_ps(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_max_ps:
+; X32: # BB#0:
+; X32-NEXT: maxps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_ps:
+; X64: # BB#0:
+; X64-NEXT: maxps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_max_ss(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_max_ss:
+; X32: # BB#0:
+; X32-NEXT: maxss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_ss:
+; X64: # BB#0:
+; X64-NEXT: maxss %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_min_ps(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_min_ps:
+; X32: # BB#0:
+; X32-NEXT: minps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_ps:
+; X64: # BB#0:
+; X64-NEXT: minps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_min_ss(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_min_ss:
+; X32: # BB#0:
+; X32-NEXT: minss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_ss:
+; X64: # BB#0:
+; X64-NEXT: minss %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_move_ss:
+; X32: # BB#0:
+; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_move_ss:
+; X64: # BB#0:
+; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_movehl_ps(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_movehl_ps:
+; X32: # BB#0:
+; X32-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_movehl_ps:
+; X64: # BB#0:
+; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_movelh_ps(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_movelh_ps:
+; X32: # BB#0:
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_movelh_ps:
+; X64: # BB#0:
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x float> %res
+}
+
+define i32 @test_mm_movemask_ps(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_movemask_ps:
+; X32: # BB#0:
+; X32-NEXT: movmskps %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_movemask_ps:
+; X64: # BB#0:
+; X64-NEXT: movmskps %xmm0, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_mul_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_mul_ps:
+; X32: # BB#0:
+; X32-NEXT: mulps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mul_ps:
+; X64: # BB#0:
+; X64-NEXT: mulps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = fmul <4 x float> %a0, %a1
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_mul_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_mul_ss:
+; X32: # BB#0:
+; X32-NEXT: mulss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mul_ss:
+; X64: # BB#0:
+; X64-NEXT: mulss %xmm1, %xmm0
+; X64-NEXT: retq
+ %ext0 = extractelement <4 x float> %a0, i32 0
+ %ext1 = extractelement <4 x float> %a1, i32 0
+ %fmul = fmul float %ext0, %ext1
+ %res = insertelement <4 x float> %a0, float %fmul, i32 0
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_or_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_or_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: pushl %esi
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $64, %esp
+; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT: orl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: orl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: orl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: leal -4(%ebp), %esp
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_or_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8
+; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: orl %eax, %edx
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: movq %rcx, %rdi
+; X64-NEXT: orl %r8d, %ecx
+; X64-NEXT: shrq $32, %r8
+; X64-NEXT: shrq $32, %rsi
+; X64-NEXT: shrq $32, %rdi
+; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: orl %r8d, %edi
+; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: orl %eax, %esi
+; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x float> %a0 to <4 x i32>
+ %arg1 = bitcast <4 x float> %a1 to <4 x i32>
+ %res = or <4 x i32> %arg0, %arg1
+ %bc = bitcast <4 x i32> %res to <4 x float>
+ ret <4 x float> %bc
+}
+
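+; llvm.prefetch with locality 0 selects the non-temporal hint, hence prefetchnta.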
+define void @test_mm_prefetch(i8* %a0) {
+; X32-LABEL: test_mm_prefetch:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: prefetchnta (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_prefetch:
+; X64: # BB#0:
+; X64-NEXT: prefetchnta (%rdi)
+; X64-NEXT: retq
+ call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1)
+ ret void
+}
+declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind readnone
+
+define <4 x float> @test_mm_rcp_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_rcp_ps:
+; X32: # BB#0:
+; X32-NEXT: rcpps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_rcp_ps:
+; X64: # BB#0:
+; X64-NEXT: rcpps %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_rcp_ss(<4 x float> %a0) {
+; X32-LABEL: test_mm_rcp_ss:
+; X32: # BB#0:
+; X32-NEXT: rcpss %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_rcp_ss:
+; X64: # BB#0:
+; X64-NEXT: rcpss %xmm0, %xmm0
+; X64-NEXT: retq
+ %rcp = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0)
+ %ext0 = extractelement <4 x float> %rcp, i32 0
+ %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
+ %ext1 = extractelement <4 x float> %a0, i32 1
+ %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
+ %ext2 = extractelement <4 x float> %a0, i32 2
+ %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
+ %ext3 = extractelement <4 x float> %a0, i32 3
+ %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
+ ret <4 x float> %ins3
+}
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_rsqrt_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_rsqrt_ps:
+; X32: # BB#0:
+; X32-NEXT: rsqrtps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_rsqrt_ps:
+; X64: # BB#0:
+; X64-NEXT: rsqrtps %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_rsqrt_ss(<4 x float> %a0) {
+; X32-LABEL: test_mm_rsqrt_ss:
+; X32: # BB#0:
+; X32-NEXT: rsqrtss %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_rsqrt_ss:
+; X64: # BB#0:
+; X64-NEXT: rsqrtss %xmm0, %xmm0
+; X64-NEXT: retq
+ %rsqrt = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0)
+ %ext0 = extractelement <4 x float> %rsqrt, i32 0
+ %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
+ %ext1 = extractelement <4 x float> %a0, i32 1
+ %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
+ %ext2 = extractelement <4 x float> %a0, i32 2
+ %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
+ %ext3 = extractelement <4 x float> %a0, i32 3
+ %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
+ ret <4 x float> %ins3
+}
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
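+; The _MM_SET_* helpers are read-modify-write sequences on MXCSR: stmxcsr to a stack
+; slot, clear the field, OR in the new value, then reload the register with ldmxcsr.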
+define void @test_MM_SET_EXCEPTION_MASK(i32 %a0) nounwind {
+; X32-LABEL: test_MM_SET_EXCEPTION_MASK:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: leal (%esp), %ecx
+; X32-NEXT: stmxcsr (%ecx)
+; X32-NEXT: movl (%esp), %edx
+; X32-NEXT: andl $-8065, %edx # imm = 0xE07F
+; X32-NEXT: orl %eax, %edx
+; X32-NEXT: movl %edx, (%esp)
+; X32-NEXT: ldmxcsr (%ecx)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_MM_SET_EXCEPTION_MASK:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: stmxcsr (%rax)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT: andl $-8065, %ecx # imm = 0xE07F
+; X64-NEXT: orl %edi, %ecx
+; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: ldmxcsr (%rax)
+; X64-NEXT: retq
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ call void @llvm.x86.sse.stmxcsr(i8* %2)
+ %3 = load i32, i32* %1
+ %4 = and i32 %3, -8065
+ %5 = or i32 %4, %a0
+ store i32 %5, i32* %1
+ call void @llvm.x86.sse.ldmxcsr(i8* %2)
+ ret void
+}
+declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind readnone
+
+define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind {
+; X32-LABEL: test_MM_SET_EXCEPTION_STATE:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: leal (%esp), %ecx
+; X32-NEXT: stmxcsr (%ecx)
+; X32-NEXT: movl (%esp), %edx
+; X32-NEXT: andl $-64, %edx
+; X32-NEXT: orl %eax, %edx
+; X32-NEXT: movl %edx, (%esp)
+; X32-NEXT: ldmxcsr (%ecx)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_MM_SET_EXCEPTION_STATE:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: stmxcsr (%rax)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT: andl $-64, %ecx
+; X64-NEXT: orl %edi, %ecx
+; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: ldmxcsr (%rax)
+; X64-NEXT: retq
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ call void @llvm.x86.sse.stmxcsr(i8* %2)
+ %3 = load i32, i32* %1
+ %4 = and i32 %3, -64
+ %5 = or i32 %4, %a0
+ store i32 %5, i32* %1
+ call void @llvm.x86.sse.ldmxcsr(i8* %2)
+ ret void
+}
+
+define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind {
+; X32-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: leal (%esp), %ecx
+; X32-NEXT: stmxcsr (%ecx)
+; X32-NEXT: movl (%esp), %edx
+; X32-NEXT: andl $-32769, %edx # imm = 0xFFFF7FFF
+; X32-NEXT: orl %eax, %edx
+; X32-NEXT: movl %edx, (%esp)
+; X32-NEXT: ldmxcsr (%ecx)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: stmxcsr (%rax)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT: andl $-32769, %ecx # imm = 0xFFFF7FFF
+; X64-NEXT: orl %edi, %ecx
+; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: ldmxcsr (%rax)
+; X64-NEXT: retq
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ call void @llvm.x86.sse.stmxcsr(i8* %2)
+ %3 = load i32, i32* %1
+ %4 = and i32 %3, -32769
+ %5 = or i32 %4, %a0
+ store i32 %5, i32* %1
+ call void @llvm.x86.sse.ldmxcsr(i8* %2)
+ ret void
+}
+
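+; _mm_set_ps places its last argument in the lowest lane, so %a3 is inserted at
+; index 0 and %a0 at index 3; _mm_setr_ps further below keeps the argument order.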
+define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
+; X32-LABEL: test_mm_set_ps:
+; X32: # BB#0:
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set_ps:
+; X64: # BB#0:
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X64-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; X64-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-NEXT: movaps %xmm3, %xmm0
+; X64-NEXT: retq
+ %res0 = insertelement <4 x float> undef, float %a3, i32 0
+ %res1 = insertelement <4 x float> %res0, float %a2, i32 1
+ %res2 = insertelement <4 x float> %res1, float %a1, i32 2
+ %res3 = insertelement <4 x float> %res2, float %a0, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x float> @test_mm_set_ps1(float %a0) nounwind {
+; X32-LABEL: test_mm_set_ps1:
+; X32: # BB#0:
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set_ps1:
+; X64: # BB#0:
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: retq
+ %res0 = insertelement <4 x float> undef, float %a0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %a0, i32 1
+ %res2 = insertelement <4 x float> %res1, float %a0, i32 2
+ %res3 = insertelement <4 x float> %res2, float %a0, i32 3
+ ret <4 x float> %res3
+}
+
+define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind {
+; X32-LABEL: test_MM_SET_ROUNDING_MODE:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: leal (%esp), %ecx
+; X32-NEXT: stmxcsr (%ecx)
+; X32-NEXT: movl (%esp), %edx
+; X32-NEXT: andl $-24577, %edx # imm = 0x9FFF
+; X32-NEXT: orl %eax, %edx
+; X32-NEXT: movl %edx, (%esp)
+; X32-NEXT: ldmxcsr (%ecx)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_MM_SET_ROUNDING_MODE:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: stmxcsr (%rax)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT: andl $-24577, %ecx # imm = 0x9FFF
+; X64-NEXT: orl %edi, %ecx
+; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: ldmxcsr (%rax)
+; X64-NEXT: retq
+ %1 = alloca i32, align 4
+ %2 = bitcast i32* %1 to i8*
+ call void @llvm.x86.sse.stmxcsr(i8* %2)
+ %3 = load i32, i32* %1
+ %4 = and i32 %3, -24577
+ %5 = or i32 %4, %a0
+ store i32 %5, i32* %1
+ call void @llvm.x86.sse.ldmxcsr(i8* %2)
+ ret void
+}
+
+define <4 x float> @test_mm_set_ss(float %a0) nounwind {
+; X32-LABEL: test_mm_set_ss:
+; X32: # BB#0:
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: xorps %xmm0, %xmm0
+; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set_ss:
+; X64: # BB#0:
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res0 = insertelement <4 x float> undef, float %a0, i32 0
+ %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
+ %res2 = insertelement <4 x float> %res1, float 0.0, i32 2
+ %res3 = insertelement <4 x float> %res2, float 0.0, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x float> @test_mm_set1_ps(float %a0) nounwind {
+; X32-LABEL: test_mm_set1_ps:
+; X32: # BB#0:
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set1_ps:
+; X64: # BB#0:
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: retq
+ %res0 = insertelement <4 x float> undef, float %a0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %a0, i32 1
+ %res2 = insertelement <4 x float> %res1, float %a0, i32 2
+ %res3 = insertelement <4 x float> %res2, float %a0, i32 3
+ ret <4 x float> %res3
+}
+
+define void @test_mm_setcsr(i32 %a0) nounwind {
+; X32-LABEL: test_mm_setcsr:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: leal (%esp), %ecx
+; X32-NEXT: movl %eax, (%esp)
+; X32-NEXT: ldmxcsr (%ecx)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setcsr:
+; X64: # BB#0:
+; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: ldmxcsr (%rax)
+; X64-NEXT: retq
+ %st = alloca i32, align 4
+ store i32 %a0, i32* %st, align 4
+ %bc = bitcast i32* %st to i8*
+ call void @llvm.x86.sse.ldmxcsr(i8* %bc)
+ ret void
+}
+
+define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
+; X32-LABEL: test_mm_setr_ps:
+; X32: # BB#0:
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setr_ps:
+; X64: # BB#0:
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %res0 = insertelement <4 x float> undef, float %a0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %a1, i32 1
+ %res2 = insertelement <4 x float> %res1, float %a2, i32 2
+ %res3 = insertelement <4 x float> %res2, float %a3, i32 3
+ ret <4 x float> %res3
+}
+
+define <4 x float> @test_mm_setzero_ps() {
+; X32-LABEL: test_mm_setzero_ps:
+; X32: # BB#0:
+; X32-NEXT: xorps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setzero_ps:
+; X64: # BB#0:
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: retq
+ ret <4 x float> zeroinitializer
+}
+
+define void @test_mm_sfence() nounwind {
+; X32-LABEL: test_mm_sfence:
+; X32: # BB#0:
+; X32-NEXT: sfence
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sfence:
+; X64: # BB#0:
+; X64-NEXT: sfence
+; X64-NEXT: retq
+ call void @llvm.x86.sse.sfence()
+ ret void
+}
+declare void @llvm.x86.sse.sfence() nounwind readnone
+
+define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_shuffle_ps:
+; X32: # BB#0:
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shuffle_ps:
+; X64: # BB#0:
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_sqrt_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_sqrt_ps:
+; X32: # BB#0:
+; X32-NEXT: sqrtps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sqrt_ps:
+; X64: # BB#0:
+; X64-NEXT: sqrtps %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) {
+; X32-LABEL: test_mm_sqrt_ss:
+; X32: # BB#0:
+; X32-NEXT: sqrtss %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sqrt_ss:
+; X64: # BB#0:
+; X64-NEXT: sqrtss %xmm0, %xmm0
+; X64-NEXT: retq
+ %sqrt = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0)
+ %ext0 = extractelement <4 x float> %sqrt, i32 0
+ %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
+ %ext1 = extractelement <4 x float> %a0, i32 1
+ %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
+ %ext2 = extractelement <4 x float> %a0, i32 2
+ %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
+ %ext3 = extractelement <4 x float> %a0, i32 3
+ %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
+ ret <4 x float> %ins3
+}
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+define void @test_mm_store_ps(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_store_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ store <4 x float> %a1, <4 x float>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_store_ps1(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_store_ps1:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store_ps1:
+; X64: # BB#0:
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
+ store <4 x float> %shuf, <4 x float>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_store_ss(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_store_ss:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movss %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store_ss:
+; X64: # BB#0:
+; X64-NEXT: movss %xmm0, (%rdi)
+; X64-NEXT: retq
+ %ext = extractelement <4 x float> %a1, i32 0
+ store float %ext, float* %a0, align 1
+ ret void
+}
+
+define void @test_mm_store1_ps(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_store1_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store1_ps:
+; X64: # BB#0:
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
+ store <4 x float> %shuf, <4 x float>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_storeh_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_storeh_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movaps %xmm0, (%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, 4(%eax)
+; X32-NEXT: movl %ecx, (%eax)
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storeh_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq %rax, (%rdi)
+; X64-NEXT: retq
+ %ptr = bitcast x86_mmx* %a0 to i64*
+ %bc = bitcast <4 x float> %a1 to <2 x i64>
+ %ext = extractelement <2 x i64> %bc, i32 1
+ store i64 %ext, i64* %ptr
+ ret void
+}
+
+define void @test_mm_storel_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_storel_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movaps %xmm0, (%esp)
+; X32-NEXT: movl (%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, 4(%eax)
+; X32-NEXT: movl %ecx, (%eax)
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storel_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq %rax, (%rdi)
+; X64-NEXT: retq
+ %ptr = bitcast x86_mmx* %a0 to i64*
+ %bc = bitcast <4 x float> %a1 to <2 x i64>
+ %ext = extractelement <2 x i64> %bc, i32 0
+ store i64 %ext, i64* %ptr
+ ret void
+}
+
+define void @test_mm_storer_ps(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_storer_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storer_ps:
+; X64: # BB#0:
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ store <4 x float> %shuf, <4 x float>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_storeu_ps(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_storeu_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movups %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storeu_ps:
+; X64: # BB#0:
+; X64-NEXT: movups %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ store <4 x float> %a1, <4 x float>* %arg0, align 1
+ ret void
+}
+
+define void @test_mm_stream_ps(float *%a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_stream_ps:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movntps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_stream_ps:
+; X64: # BB#0:
+; X64-NEXT: movntps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast float* %a0 to <4 x float>*
+ store <4 x float> %a1, <4 x float>* %arg0, align 16, !nontemporal !0
+ ret void
+}
+
+define <4 x float> @test_mm_sub_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_sub_ps:
+; X32: # BB#0:
+; X32-NEXT: subps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sub_ps:
+; X64: # BB#0:
+; X64-NEXT: subps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = fsub <4 x float> %a0, %a1
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_sub_ss:
+; X32: # BB#0:
+; X32-NEXT: subss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sub_ss:
+; X64: # BB#0:
+; X64-NEXT: subss %xmm1, %xmm0
+; X64-NEXT: retq
+ %ext0 = extractelement <4 x float> %a0, i32 0
+ %ext1 = extractelement <4 x float> %a1, i32 0
+ %fsub = fsub float %ext0, %ext1
+ %res = insertelement <4 x float> %a0, float %fsub, i32 0
+ ret <4 x float> %res
+}
+
+define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x float>* %a2, <4 x float>* %a3) nounwind {
+; X32-LABEL: test_MM_TRANSPOSE4_PS:
+; X32: # BB#0:
+; X32-NEXT: pushl %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movaps (%esi), %xmm0
+; X32-NEXT: movaps (%edx), %xmm1
+; X32-NEXT: movaps (%ecx), %xmm2
+; X32-NEXT: movaps (%eax), %xmm3
+; X32-NEXT: movaps %xmm0, %xmm4
+; X32-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; X32-NEXT: movaps %xmm2, %xmm5
+; X32-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; X32-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; X32-NEXT: movaps %xmm4, %xmm1
+; X32-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; X32-NEXT: movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1]
+; X32-NEXT: movaps %xmm0, %xmm3
+; X32-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; X32-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; X32-NEXT: movaps %xmm1, (%esi)
+; X32-NEXT: movaps %xmm5, (%edx)
+; X32-NEXT: movaps %xmm3, (%ecx)
+; X32-NEXT: movaps %xmm2, (%eax)
+; X32-NEXT: popl %esi
+; X32-NEXT: retl
+;
+; X64-LABEL: test_MM_TRANSPOSE4_PS:
+; X64: # BB#0:
+; X64-NEXT: movaps (%rdi), %xmm0
+; X64-NEXT: movaps (%rsi), %xmm1
+; X64-NEXT: movaps (%rdx), %xmm2
+; X64-NEXT: movaps (%rcx), %xmm3
+; X64-NEXT: movaps %xmm0, %xmm4
+; X64-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; X64-NEXT: movaps %xmm2, %xmm5
+; X64-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; X64-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; X64-NEXT: movaps %xmm4, %xmm1
+; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; X64-NEXT: movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1]
+; X64-NEXT: movaps %xmm0, %xmm3
+; X64-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; X64-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; X64-NEXT: movaps %xmm1, (%rdi)
+; X64-NEXT: movaps %xmm5, (%rsi)
+; X64-NEXT: movaps %xmm3, (%rdx)
+; X64-NEXT: movaps %xmm2, (%rcx)
+; X64-NEXT: retq
+ %row0 = load <4 x float>, <4 x float>* %a0, align 16
+ %row1 = load <4 x float>, <4 x float>* %a1, align 16
+ %row2 = load <4 x float>, <4 x float>* %a2, align 16
+ %row3 = load <4 x float>, <4 x float>* %a3, align 16
+ %tmp0 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %tmp2 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %tmp1 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %tmp3 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %res0 = shufflevector <4 x float> %tmp0, <4 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %res1 = shufflevector <4 x float> %tmp2, <4 x float> %tmp0, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+ %res2 = shufflevector <4 x float> %tmp1, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %res3 = shufflevector <4 x float> %tmp3, <4 x float> %tmp1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+ store <4 x float> %res0, <4 x float>* %a0, align 16
+ store <4 x float> %res1, <4 x float>* %a1, align 16
+ store <4 x float> %res2, <4 x float>* %a2, align 16
+ store <4 x float> %res3, <4 x float>* %a3, align 16
+ ret void
+}
+
+define i32 @test_mm_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_ucomieq_ss:
+; X32: # BB#0:
+; X32-NEXT: ucomiss %xmm1, %xmm0
+; X32-NEXT: setnp %al
+; X32-NEXT: sete %cl
+; X32-NEXT: andb %al, %cl
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomieq_ss:
+; X64: # BB#0:
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: setnp %al
+; X64-NEXT: sete %cl
+; X64-NEXT: andb %al, %cl
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_ucomige_ss:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ucomiss %xmm1, %xmm0
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomige_ss:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_ucomigt_ss:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ucomiss %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomigt_ss:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_ucomile_ss:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ucomiss %xmm0, %xmm1
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomile_ss:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ucomiss %xmm0, %xmm1
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_ucomilt_ss:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ucomiss %xmm0, %xmm1
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomilt_ss:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ucomiss %xmm0, %xmm1
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define i32 @test_mm_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_ucomineq_ss:
+; X32: # BB#0:
+; X32-NEXT: ucomiss %xmm1, %xmm0
+; X32-NEXT: setp %al
+; X32-NEXT: setne %cl
+; X32-NEXT: orb %al, %cl
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomineq_ss:
+; X64: # BB#0:
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: setp %al
+; X64-NEXT: setne %cl
+; X64-NEXT: orb %al, %cl
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_mm_undefined_ps() {
+; X32-LABEL: test_mm_undefined_ps:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_undefined_ps:
+; X64: # BB#0:
+; X64-NEXT: retq
+ ret <4 x float> undef
+}
+
+define <4 x float> @test_mm_unpackhi_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_unpackhi_ps:
+; X32: # BB#0:
+; X32-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpackhi_ps:
+; X64: # BB#0:
+; X64-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_unpacklo_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_unpacklo_ps:
+; X32: # BB#0:
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpacklo_ps:
+; X64: # BB#0:
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mm_xor_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_xor_ps:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: pushl %esi
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $64, %esp
+; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT: xorl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: xorl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: leal -4(%ebp), %esp
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_xor_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8
+; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: movq %rdx, %rsi
+; X64-NEXT: xorl %eax, %edx
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: movq %rcx, %rdi
+; X64-NEXT: xorl %r8d, %ecx
+; X64-NEXT: shrq $32, %r8
+; X64-NEXT: shrq $32, %rsi
+; X64-NEXT: shrq $32, %rdi
+; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: xorl %r8d, %edi
+; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: xorl %eax, %esi
+; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: retq
+ %arg0 = bitcast <4 x float> %a0 to <4 x i32>
+ %arg1 = bitcast <4 x float> %a1 to <4 x i32>
+ %res = xor <4 x i32> %arg0, %arg1
+ %bc = bitcast <4 x i32> %res to <4 x float>
+ ret <4 x float> %bc
+}
+
+!0 = !{i32 1}
diff --git a/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
new file mode 100644
index 000000000000..2900c277f124
--- /dev/null
+++ b/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s
+
+define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
+; SSE-LABEL: test_x86_sse_storeu_ps:
+; SSE: ## BB#0:
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: movups %xmm0, (%eax)
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_storeu_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL-NEXT: vmovups %xmm0, (%eax)
+; KNL-NEXT: retl
+; CHECK-LABEL: test_x86_sse_storeu_ps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movups %xmm0, (%eax)
+; CHECK-NEXT: retl
+ call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
+ ret void
+}
+declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
+
+
diff --git a/test/CodeGen/X86/sse-intrinsics-x86.ll b/test/CodeGen/X86/sse-intrinsics-x86.ll
index 0857189be734..1df432185701 100644
--- a/test/CodeGen/X86/sse-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse-intrinsics-x86.ll
@@ -1,8 +1,17 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse | FileCheck %s
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL
define <4 x float> @test_x86_sse_add_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: addss
+; SSE-LABEL: test_x86_sse_add_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_add_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -10,7 +19,15 @@ declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_cmp_ps(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: cmpordps
+; SSE-LABEL: test_x86_sse_cmp_ps:
+; SSE: ## BB#0:
+; SSE-NEXT: cmpordps %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_cmp_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vcmpordps %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -18,7 +35,15 @@ declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind
define <4 x float> @test_x86_sse_cmp_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: cmpordss
+; SSE-LABEL: test_x86_sse_cmp_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: cmpordss %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_cmp_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vcmpordss %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -26,9 +51,23 @@ declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind
define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: comiss
- ; CHECK: sete
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_comieq_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: comiss %xmm1, %xmm0
+; SSE-NEXT: setnp %al
+; SSE-NEXT: sete %cl
+; SSE-NEXT: andb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_comieq_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vcomiss %xmm1, %xmm0
+; KNL-NEXT: setnp %al
+; KNL-NEXT: sete %cl
+; KNL-NEXT: andb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -36,9 +75,19 @@ declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comige_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: comiss
- ; CHECK: setae
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_comige_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comiss %xmm1, %xmm0
+; SSE-NEXT: setae %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_comige_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vcomiss %xmm1, %xmm0
+; KNL-NEXT: setae %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -46,9 +95,19 @@ declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comigt_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: comiss
- ; CHECK: seta
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_comigt_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comiss %xmm1, %xmm0
+; SSE-NEXT: seta %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_comigt_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vcomiss %xmm1, %xmm0
+; KNL-NEXT: seta %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -56,9 +115,19 @@ declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comile_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: comiss
- ; CHECK: setbe
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_comile_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comiss %xmm0, %xmm1
+; SSE-NEXT: setae %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_comile_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vcomiss %xmm0, %xmm1
+; KNL-NEXT: setae %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -66,8 +135,19 @@ declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comilt_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: comiss
- ; CHECK: sbb
+; SSE-LABEL: test_x86_sse_comilt_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comiss %xmm0, %xmm1
+; SSE-NEXT: seta %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_comilt_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vcomiss %xmm0, %xmm1
+; KNL-NEXT: seta %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -75,9 +155,23 @@ declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: comiss
- ; CHECK: setne
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_comineq_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: comiss %xmm1, %xmm0
+; SSE-NEXT: setp %al
+; SSE-NEXT: setne %cl
+; SSE-NEXT: orb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_comineq_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vcomiss %xmm1, %xmm0
+; KNL-NEXT: setp %al
+; KNL-NEXT: setne %cl
+; KNL-NEXT: orb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -85,8 +179,17 @@ declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_cvtsi2ss(<4 x float> %a0) {
- ; CHECK: movl
- ; CHECK: cvtsi2ss
+; SSE-LABEL: test_x86_sse_cvtsi2ss:
+; SSE: ## BB#0:
+; SSE-NEXT: movl $7, %eax
+; SSE-NEXT: cvtsi2ssl %eax, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_cvtsi2ss:
+; KNL: ## BB#0:
+; KNL-NEXT: movl $7, %eax
+; KNL-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -94,7 +197,15 @@ declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
define i32 @test_x86_sse_cvtss2si(<4 x float> %a0) {
- ; CHECK: cvtss2si
+; SSE-LABEL: test_x86_sse_cvtss2si:
+; SSE: ## BB#0:
+; SSE-NEXT: cvtss2si %xmm0, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_cvtss2si:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtss2si %xmm0, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -102,7 +213,15 @@ declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
define i32 @test_x86_sse_cvttss2si(<4 x float> %a0) {
- ; CHECK: cvttss2si
+; SSE-LABEL: test_x86_sse_cvttss2si:
+; SSE: ## BB#0:
+; SSE-NEXT: cvttss2si %xmm0, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_cvttss2si:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvttss2si %xmm0, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -110,7 +229,15 @@ declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_div_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: divss
+; SSE-LABEL: test_x86_sse_div_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: divss %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_div_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -118,8 +245,17 @@ declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind read
define void @test_x86_sse_ldmxcsr(i8* %a0) {
- ; CHECK: movl
- ; CHECK: ldmxcsr
+; SSE-LABEL: test_x86_sse_ldmxcsr:
+; SSE: ## BB#0:
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: ldmxcsr (%eax)
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_ldmxcsr:
+; KNL: ## BB#0:
+; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL-NEXT: vldmxcsr (%eax)
+; KNL-NEXT: retl
call void @llvm.x86.sse.ldmxcsr(i8* %a0)
ret void
}
@@ -128,7 +264,15 @@ declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind
define <4 x float> @test_x86_sse_max_ps(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: maxps
+; SSE-LABEL: test_x86_sse_max_ps:
+; SSE: ## BB#0:
+; SSE-NEXT: maxps %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_max_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vmaxps %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -136,7 +280,15 @@ declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_max_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: maxss
+; SSE-LABEL: test_x86_sse_max_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: maxss %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_max_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -144,7 +296,15 @@ declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_min_ps(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: minps
+; SSE-LABEL: test_x86_sse_min_ps:
+; SSE: ## BB#0:
+; SSE-NEXT: minps %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_min_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vminps %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -152,7 +312,15 @@ declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_min_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: minss
+; SSE-LABEL: test_x86_sse_min_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: minss %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_min_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vminss %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -160,7 +328,15 @@ declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind read
define i32 @test_x86_sse_movmsk_ps(<4 x float> %a0) {
- ; CHECK: movmskps
+; SSE-LABEL: test_x86_sse_movmsk_ps:
+; SSE: ## BB#0:
+; SSE-NEXT: movmskps %xmm0, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_movmsk_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovmskps %xmm0, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -169,7 +345,15 @@ declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_mul_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: mulss
+; SSE-LABEL: test_x86_sse_mul_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: mulss %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_mul_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -177,7 +361,15 @@ declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_rcp_ps(<4 x float> %a0) {
- ; CHECK: rcpps
+; SSE-LABEL: test_x86_sse_rcp_ps:
+; SSE: ## BB#0:
+; SSE-NEXT: rcpps %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_rcp_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vrcpps %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -185,7 +377,15 @@ declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_rcp_ss(<4 x float> %a0) {
- ; CHECK: rcpss
+; SSE-LABEL: test_x86_sse_rcp_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: rcpss %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_rcp_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -193,7 +393,15 @@ declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_rsqrt_ps(<4 x float> %a0) {
- ; CHECK: rsqrtps
+; SSE-LABEL: test_x86_sse_rsqrt_ps:
+; SSE: ## BB#0:
+; SSE-NEXT: rsqrtps %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_rsqrt_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vrsqrtps %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -201,7 +409,15 @@ declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_rsqrt_ss(<4 x float> %a0) {
- ; CHECK: rsqrtss
+; SSE-LABEL: test_x86_sse_rsqrt_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: rsqrtss %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_rsqrt_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -209,7 +425,15 @@ declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) {
- ; CHECK: sqrtps
+; SSE-LABEL: test_x86_sse_sqrt_ps:
+; SSE: ## BB#0:
+; SSE-NEXT: sqrtps %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_sqrt_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vsqrtps %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -217,7 +441,15 @@ declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) {
- ; CHECK: sqrtss
+; SSE-LABEL: test_x86_sse_sqrt_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: sqrtss %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_sqrt_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -225,25 +457,33 @@ declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define void @test_x86_sse_stmxcsr(i8* %a0) {
- ; CHECK: movl
- ; CHECK: stmxcsr
+; SSE-LABEL: test_x86_sse_stmxcsr:
+; SSE: ## BB#0:
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: stmxcsr (%eax)
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_stmxcsr:
+; KNL: ## BB#0:
+; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL-NEXT: vstmxcsr (%eax)
+; KNL-NEXT: retl
call void @llvm.x86.sse.stmxcsr(i8* %a0)
ret void
}
declare void @llvm.x86.sse.stmxcsr(i8*) nounwind
-define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
- ; CHECK: movl
- ; CHECK: movups
- call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
- ret void
-}
-declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
-
-
define <4 x float> @test_x86_sse_sub_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: subss
+; SSE-LABEL: test_x86_sse_sub_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: subss %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_sub_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -251,9 +491,23 @@ declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind read
define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: ucomiss
- ; CHECK: sete
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_ucomieq_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: ucomiss %xmm1, %xmm0
+; SSE-NEXT: setnp %al
+; SSE-NEXT: sete %cl
+; SSE-NEXT: andb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_ucomieq_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vucomiss %xmm1, %xmm0
+; KNL-NEXT: setnp %al
+; KNL-NEXT: sete %cl
+; KNL-NEXT: andb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -261,9 +515,19 @@ declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomige_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: ucomiss
- ; CHECK: setae
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_ucomige_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: ucomiss %xmm1, %xmm0
+; SSE-NEXT: setae %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_ucomige_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vucomiss %xmm1, %xmm0
+; KNL-NEXT: setae %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -271,9 +535,19 @@ declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: ucomiss
- ; CHECK: seta
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_ucomigt_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: ucomiss %xmm1, %xmm0
+; SSE-NEXT: seta %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_ucomigt_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vucomiss %xmm1, %xmm0
+; KNL-NEXT: seta %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -281,9 +555,19 @@ declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomile_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: ucomiss
- ; CHECK: setbe
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_ucomile_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: ucomiss %xmm0, %xmm1
+; SSE-NEXT: setae %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_ucomile_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vucomiss %xmm0, %xmm1
+; KNL-NEXT: setae %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -291,8 +575,19 @@ declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: ucomiss
- ; CHECK: sbbl
+; SSE-LABEL: test_x86_sse_ucomilt_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: ucomiss %xmm0, %xmm1
+; SSE-NEXT: seta %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_ucomilt_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vucomiss %xmm0, %xmm1
+; KNL-NEXT: seta %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -300,9 +595,23 @@ declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: ucomiss
- ; CHECK: setne
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse_ucomineq_ss:
+; SSE: ## BB#0:
+; SSE-NEXT: ucomiss %xmm1, %xmm0
+; SSE-NEXT: setp %al
+; SSE-NEXT: setne %cl
+; SSE-NEXT: orb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse_ucomineq_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vucomiss %xmm1, %xmm0
+; KNL-NEXT: setp %al
+; KNL-NEXT: setne %cl
+; KNL-NEXT: orb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
diff --git a/test/CodeGen/X86/sse1.ll b/test/CodeGen/X86/sse1.ll
index fd35e75d71ae..29c041ba7f6c 100644
--- a/test/CodeGen/X86/sse1.ll
+++ b/test/CodeGen/X86/sse1.ll
@@ -47,3 +47,18 @@ entry:
 %a14 = select <4 x i1> %a1, <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00> , <4 x float> zeroinitializer
ret <4 x float> %a14
}
+
+; v4i32 isn't legal for SSE1, but this should be cmpps.
+
+define <4 x float> @PR28044(<4 x float> %a0, <4 x float> %a1) nounwind {
+; CHECK-LABEL: PR28044:
+; CHECK: # BB#0:
+; CHECK-NEXT: cmpeqps %xmm1, %xmm0
+; CHECK-NEXT: ret
+;
+ %cmp = fcmp oeq <4 x float> %a0, %a1
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %res
+}
+
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
new file mode 100644
index 000000000000..f5ecfa444d86
--- /dev/null
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c
+
+define i64 @test_mm_cvtsd_si64(<2 x double> %a0) nounwind {
+; X64-LABEL: test_mm_cvtsd_si64:
+; X64: # BB#0:
+; X64-NEXT: cvtsd2si %xmm0, %rax
+; X64-NEXT: retq
+ %res = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
+ ret i64 %res
+}
+declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
+
+define i64 @test_mm_cvtsi128_si64(<2 x i64> %a0) nounwind {
+; X64-LABEL: test_mm_cvtsi128_si64:
+; X64: # BB#0:
+; X64-NEXT: movd %xmm0, %rax
+; X64-NEXT: retq
+ %res = extractelement <2 x i64> %a0, i32 0
+ ret i64 %res
+}
+
+define <2 x double> @test_mm_cvtsi64_sd(<2 x double> %a0, i64 %a1) nounwind {
+; X64-LABEL: test_mm_cvtsi64_sd:
+; X64: # BB#0:
+; X64-NEXT: cvtsi2sdq %rdi, %xmm1
+; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: retq
+ %cvt = sitofp i64 %a1 to double
+ %res = insertelement <2 x double> %a0, double %cvt, i32 0
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_cvtsi64_si128(i64 %a0) nounwind {
+; X64-LABEL: test_mm_cvtsi64_si128:
+; X64: # BB#0:
+; X64-NEXT: movd %rdi, %xmm0
+; X64-NEXT: retq
+ %res0 = insertelement <2 x i64> undef, i64 %a0, i32 0
+ %res1 = insertelement <2 x i64> %res0, i64 0, i32 1
+ ret <2 x i64> %res1
+}
+
+define i64 @test_mm_cvttsd_si64(<2 x double> %a0) nounwind {
+; X64-LABEL: test_mm_cvttsd_si64:
+; X64: # BB#0:
+; X64-NEXT: cvttsd2si %xmm0, %rax
+; X64-NEXT: retq
+ %ext = extractelement <2 x double> %a0, i32 0
+ %res = fptosi double %ext to i64
+ ret i64 %res
+}
+
+define <2 x i64> @test_mm_loadu_si64(i64* %a0) nounwind {
+; X64-LABEL: test_mm_loadu_si64:
+; X64: # BB#0:
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: retq
+ %ld = load i64, i64* %a0, align 1
+ %res0 = insertelement <2 x i64> undef, i64 %ld, i32 0
+ %res1 = insertelement <2 x i64> %res0, i64 0, i32 1
+ ret <2 x i64> %res1
+}
+
+define void @test_mm_stream_si64(i64 *%a0, i64 %a1) {
+; X64-LABEL: test_mm_stream_si64:
+; X64: # BB#0:
+; X64-NEXT: movntiq %rsi, (%rdi)
+; X64-NEXT: retq
+ store i64 %a1, i64* %a0, align 1, !nontemporal !0
+ ret void
+}
+
+!0 = !{i64 1}
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..fa71325d7d6e
--- /dev/null
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -0,0 +1,3849 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c
+
+define <2 x i64> @test_mm_add_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_add_epi8:
+; X32: # BB#0:
+; X32-NEXT: paddb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_add_epi8:
+; X64: # BB#0:
+; X64-NEXT: paddb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = add <16 x i8> %arg0, %arg1
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_add_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_add_epi16:
+; X32: # BB#0:
+; X32-NEXT: paddw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_add_epi16:
+; X64: # BB#0:
+; X64-NEXT: paddw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = add <8 x i16> %arg0, %arg1
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_add_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_add_epi32:
+; X32: # BB#0:
+; X32-NEXT: paddd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_add_epi32:
+; X64: # BB#0:
+; X64-NEXT: paddd %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = add <4 x i32> %arg0, %arg1
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_add_epi64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_add_epi64:
+; X32: # BB#0:
+; X32-NEXT: paddq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_add_epi64:
+; X64: # BB#0:
+; X64-NEXT: paddq %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = add <2 x i64> %a0, %a1
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_add_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_add_pd:
+; X32: # BB#0:
+; X32-NEXT: addpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_add_pd:
+; X64: # BB#0:
+; X64-NEXT: addpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = fadd <2 x double> %a0, %a1
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_add_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_add_sd:
+; X32: # BB#0:
+; X32-NEXT: addsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_add_sd:
+; X64: # BB#0:
+; X64-NEXT: addsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %ext0 = extractelement <2 x double> %a0, i32 0
+ %ext1 = extractelement <2 x double> %a1, i32 0
+ %fadd = fadd double %ext0, %ext1
+ %res = insertelement <2 x double> %a0, double %fadd, i32 0
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_adds_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_adds_epi8:
+; X32: # BB#0:
+; X32-NEXT: paddsb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_adds_epi8:
+; X64: # BB#0:
+; X64-NEXT: paddsb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_adds_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_adds_epi16:
+; X32: # BB#0:
+; X32-NEXT: paddsw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_adds_epi16:
+; X64: # BB#0:
+; X64-NEXT: paddsw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_adds_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_adds_epu8:
+; X32: # BB#0:
+; X32-NEXT: paddusb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_adds_epu8:
+; X64: # BB#0:
+; X64-NEXT: paddusb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_adds_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_adds_epu16:
+; X32: # BB#0:
+; X32-NEXT: paddusw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_adds_epu16:
+; X64: # BB#0:
+; X64-NEXT: paddusw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x double> @test_mm_and_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_and_pd:
+; X32: # BB#0:
+; X32-NEXT: andps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_and_pd:
+; X64: # BB#0:
+; X64-NEXT: andps %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x double> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x double> %a1 to <4 x i32>
+ %res = and <4 x i32> %arg0, %arg1
+ %bc = bitcast <4 x i32> %res to <2 x double>
+ ret <2 x double> %bc
+}
+
+define <2 x i64> @test_mm_and_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_and_si128:
+; X32: # BB#0:
+; X32-NEXT: andps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_and_si128:
+; X64: # BB#0:
+; X64-NEXT: andps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = and <2 x i64> %a0, %a1
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_andnot_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_andnot_pd:
+; X32: # BB#0:
+; X32-NEXT: andnps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_andnot_pd:
+; X64: # BB#0:
+; X64-NEXT: andnps %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x double> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x double> %a1 to <4 x i32>
+ %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %res = and <4 x i32> %not, %arg1
+ %bc = bitcast <4 x i32> %res to <2 x double>
+ ret <2 x double> %bc
+}
+
+define <2 x i64> @test_mm_andnot_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_andnot_si128:
+; X32: # BB#0:
+; X32-NEXT: pcmpeqd %xmm2, %xmm2
+; X32-NEXT: pxor %xmm2, %xmm0
+; X32-NEXT: pand %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_andnot_si128:
+; X64: # BB#0:
+; X64-NEXT: pcmpeqd %xmm2, %xmm2
+; X64-NEXT: pxor %xmm2, %xmm0
+; X64-NEXT: pand %xmm1, %xmm0
+; X64-NEXT: retq
+ %not = xor <2 x i64> %a0, <i64 -1, i64 -1>
+ %res = and <2 x i64> %not, %a1
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_avg_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_avg_epu8:
+; X32: # BB#0:
+; X32-NEXT: pavgb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_avg_epu8:
+; X64: # BB#0:
+; X64-NEXT: pavgb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1) nounwind readnone
+
+define <2 x i64> @test_mm_avg_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_avg_epu16:
+; X32: # BB#0:
+; X32-NEXT: pavgw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_avg_epu16:
+; X64: # BB#0:
+; X64-NEXT: pavgw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_bslli_si128(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_bslli_si128:
+; X32: # BB#0:
+; X32-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_bslli_si128:
+; X64: # BB#0:
+; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = shufflevector <16 x i8> zeroinitializer, <16 x i8> %arg0, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_bsrli_si128(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_bsrli_si128:
+; X32: # BB#0:
+; X32-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_bsrli_si128:
+; X64: # BB#0:
+; X64-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = shufflevector <16 x i8> %arg0, <16 x i8> zeroinitializer, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <4 x float> @test_mm_castpd_ps(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_castpd_ps:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_castpd_ps:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <2 x double> %a0 to <4 x float>
+ ret <4 x float> %res
+}
+
+define <2 x i64> @test_mm_castpd_si128(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_castpd_si128:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_castpd_si128:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <2 x double> %a0 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_castps_pd(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_castps_pd:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_castps_pd:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <4 x float> %a0 to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_castps_si128(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_castps_si128:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_castps_si128:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <4 x float> %a0 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_castsi128_pd(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_castsi128_pd:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_castsi128_pd:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <2 x i64> %a0 to <2 x double>
+ ret <2 x double> %res
+}
+
+define <4 x float> @test_mm_castsi128_ps(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_castsi128_ps:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_castsi128_ps:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = bitcast <2 x i64> %a0 to <4 x float>
+ ret <4 x float> %res
+}
+
+define void @test_mm_clflush(i8* %a0) nounwind {
+; X32-LABEL: test_mm_clflush:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: clflush (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_clflush:
+; X64: # BB#0:
+; X64-NEXT: clflush (%rdi)
+; X64-NEXT: retq
+ call void @llvm.x86.sse2.clflush(i8* %a0)
+ ret void
+}
+declare void @llvm.x86.sse2.clflush(i8*) nounwind readnone
+
+define <2 x i64> @test_mm_cmpeq_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_epi8:
+; X32: # BB#0:
+; X32-NEXT: pcmpeqb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpeq_epi8:
+; X64: # BB#0:
+; X64-NEXT: pcmpeqb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %cmp = icmp eq <16 x i8> %arg0, %arg1
+ %res = sext <16 x i1> %cmp to <16 x i8>
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_cmpeq_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_epi16:
+; X32: # BB#0:
+; X32-NEXT: pcmpeqw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpeq_epi16:
+; X64: # BB#0:
+; X64-NEXT: pcmpeqw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %cmp = icmp eq <8 x i16> %arg0, %arg1
+ %res = sext <8 x i1> %cmp to <8 x i16>
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_cmpeq_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_epi32:
+; X32: # BB#0:
+; X32-NEXT: pcmpeqd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpeq_epi32:
+; X64: # BB#0:
+; X64-NEXT: pcmpeqd %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %cmp = icmp eq <4 x i32> %arg0, %arg1
+ %res = sext <4 x i1> %cmp to <4 x i32>
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_cmpeq_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpeqpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpeq_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpeqpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp oeq <2 x double> %a0, %a1
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpeq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpeq_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpeqsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpeq_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpeqsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <2 x double> @test_mm_cmpge_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpge_pd:
+; X32: # BB#0:
+; X32-NEXT: cmplepd %xmm0, %xmm1
+; X32-NEXT: movapd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpge_pd:
+; X64: # BB#0:
+; X64-NEXT: cmplepd %xmm0, %xmm1
+; X64-NEXT: movapd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp ole <2 x double> %a1, %a0
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpge_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpge_sd:
+; X32: # BB#0:
+; X32-NEXT: cmplesd %xmm0, %xmm1
+; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpge_sd:
+; X64: # BB#0:
+; X64-NEXT: cmplesd %xmm0, %xmm1
+; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: retq
+ %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 2)
+ %ext0 = extractelement <2 x double> %cmp, i32 0
+ %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
+ %ext1 = extractelement <2 x double> %a0, i32 1
+ %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
+ ret <2 x double> %ins1
+}
+
+define <2 x i64> @test_mm_cmpgt_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_epi8:
+; X32: # BB#0:
+; X32-NEXT: pcmpgtb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpgt_epi8:
+; X64: # BB#0:
+; X64-NEXT: pcmpgtb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %cmp = icmp sgt <16 x i8> %arg0, %arg1
+ %res = sext <16 x i1> %cmp to <16 x i8>
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_cmpgt_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_epi16:
+; X32: # BB#0:
+; X32-NEXT: pcmpgtw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpgt_epi16:
+; X64: # BB#0:
+; X64-NEXT: pcmpgtw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %cmp = icmp sgt <8 x i16> %arg0, %arg1
+ %res = sext <8 x i1> %cmp to <8 x i16>
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_cmpgt_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_epi32:
+; X32: # BB#0:
+; X32-NEXT: pcmpgtd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpgt_epi32:
+; X64: # BB#0:
+; X64-NEXT: pcmpgtd %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %cmp = icmp sgt <4 x i32> %arg0, %arg1
+ %res = sext <4 x i1> %cmp to <4 x i32>
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_cmpgt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpltpd %xmm0, %xmm1
+; X32-NEXT: movapd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpgt_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpltpd %xmm0, %xmm1
+; X64-NEXT: movapd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp olt <2 x double> %a1, %a0
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpgt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpgt_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpltsd %xmm0, %xmm1
+; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpgt_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpltsd %xmm0, %xmm1
+; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: retq
+ %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 1)
+ %ext0 = extractelement <2 x double> %cmp, i32 0
+ %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
+ %ext1 = extractelement <2 x double> %a0, i32 1
+ %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
+ ret <2 x double> %ins1
+}
+
+define <2 x double> @test_mm_cmple_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmple_pd:
+; X32: # BB#0:
+; X32-NEXT: cmplepd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmple_pd:
+; X64: # BB#0:
+; X64-NEXT: cmplepd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp ole <2 x double> %a0, %a1
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmple_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmple_sd:
+; X32: # BB#0:
+; X32-NEXT: cmplesd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmple_sd:
+; X64: # BB#0:
+; X64-NEXT: cmplesd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 2)
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_cmplt_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_epi8:
+; X32: # BB#0:
+; X32-NEXT: pcmpgtb %xmm0, %xmm1
+; X32-NEXT: movdqa %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmplt_epi8:
+; X64: # BB#0:
+; X64-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %cmp = icmp sgt <16 x i8> %arg1, %arg0
+ %res = sext <16 x i1> %cmp to <16 x i8>
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_cmplt_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_epi16:
+; X32: # BB#0:
+; X32-NEXT: pcmpgtw %xmm0, %xmm1
+; X32-NEXT: movdqa %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmplt_epi16:
+; X64: # BB#0:
+; X64-NEXT: pcmpgtw %xmm0, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %cmp = icmp sgt <8 x i16> %arg1, %arg0
+ %res = sext <8 x i1> %cmp to <8 x i16>
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_cmplt_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_epi32:
+; X32: # BB#0:
+; X32-NEXT: pcmpgtd %xmm0, %xmm1
+; X32-NEXT: movdqa %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmplt_epi32:
+; X64: # BB#0:
+; X64-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %cmp = icmp sgt <4 x i32> %arg1, %arg0
+ %res = sext <4 x i1> %cmp to <4 x i32>
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_cmplt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpltpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmplt_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpltpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp olt <2 x double> %a0, %a1
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmplt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmplt_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpltsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmplt_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpltsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 1)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpneq_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpneq_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpneqpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpneq_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpneqpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp une <2 x double> %a0, %a1
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpneq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpneq_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpneqsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpneq_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpneqsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpnge_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnge_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpnlepd %xmm0, %xmm1
+; X32-NEXT: movapd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnge_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpnlepd %xmm0, %xmm1
+; X64-NEXT: movapd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp ugt <2 x double> %a1, %a0
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpnge_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnge_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpnlesd %xmm0, %xmm1
+; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnge_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpnlesd %xmm0, %xmm1
+; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: retq
+ %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 6)
+ %ext0 = extractelement <2 x double> %cmp, i32 0
+ %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
+ %ext1 = extractelement <2 x double> %a0, i32 1
+ %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
+ ret <2 x double> %ins1
+}
+
+define <2 x double> @test_mm_cmpngt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpngt_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpnltpd %xmm0, %xmm1
+; X32-NEXT: movapd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpngt_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpnltpd %xmm0, %xmm1
+; X64-NEXT: movapd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp uge <2 x double> %a1, %a0
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpngt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpngt_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpnltsd %xmm0, %xmm1
+; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpngt_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpnltsd %xmm0, %xmm1
+; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: retq
+ %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 5)
+ %ext0 = extractelement <2 x double> %cmp, i32 0
+ %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
+ %ext1 = extractelement <2 x double> %a0, i32 1
+ %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
+ ret <2 x double> %ins1
+}
+
+define <2 x double> @test_mm_cmpnle_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnle_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpnlepd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnle_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpnlepd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp ugt <2 x double> %a0, %a1
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpnle_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnle_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpnlesd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnle_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpnlesd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 6)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpnlt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnlt_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpnltpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnlt_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpnltpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp uge <2 x double> %a0, %a1
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpnlt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpnlt_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpnltsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpnlt_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpnltsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 5)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpord_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpord_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpordpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpord_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpordpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp ord <2 x double> %a0, %a1
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpord_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpord_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpordsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpord_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpordsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 7)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpunord_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpunord_pd:
+; X32: # BB#0:
+; X32-NEXT: cmpunordpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpunord_pd:
+; X64: # BB#0:
+; X64-NEXT: cmpunordpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %fcmp = fcmp uno <2 x double> %a0, %a1
+ %sext = sext <2 x i1> %fcmp to <2 x i64>
+ %res = bitcast <2 x i64> %sext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_cmpunord_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_cmpunord_sd:
+; X32: # BB#0:
+; X32-NEXT: cmpunordsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpunord_sd:
+; X64: # BB#0:
+; X64-NEXT: cmpunordsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 3)
+ ret <2 x double> %res
+}
+
+define i32 @test_mm_comieq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_comieq_sd:
+; X32: # BB#0:
+; X32-NEXT: comisd %xmm1, %xmm0
+; X32-NEXT: setnp %al
+; X32-NEXT: sete %cl
+; X32-NEXT: andb %al, %cl
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comieq_sd:
+; X64: # BB#0:
+; X64-NEXT: comisd %xmm1, %xmm0
+; X64-NEXT: setnp %al
+; X64-NEXT: sete %cl
+; X64-NEXT: andb %al, %cl
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_comige_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_comige_sd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: comisd %xmm1, %xmm0
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comige_sd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: comisd %xmm1, %xmm0
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.comige.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comige.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_comigt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_comigt_sd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: comisd %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comigt_sd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: comisd %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.comigt.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comigt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_comile_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_comile_sd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: comisd %xmm0, %xmm1
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comile_sd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: comisd %xmm0, %xmm1
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.comile.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comile.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_comilt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_comilt_sd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: comisd %xmm0, %xmm1
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comilt_sd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: comisd %xmm0, %xmm1
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.comilt.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comilt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_comineq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_comineq_sd:
+; X32: # BB#0:
+; X32-NEXT: comisd %xmm1, %xmm0
+; X32-NEXT: setp %al
+; X32-NEXT: setne %cl
+; X32-NEXT: orb %al, %cl
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_comineq_sd:
+; X64: # BB#0:
+; X64-NEXT: comisd %xmm1, %xmm0
+; X64-NEXT: setp %al
+; X64-NEXT: setne %cl
+; X64-NEXT: orb %al, %cl
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comineq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_mm_cvtepi32_pd(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_cvtepi32_pd:
+; X32: # BB#0:
+; X32-NEXT: cvtdq2pd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepi32_pd:
+; X64: # BB#0:
+; X64-NEXT: cvtdq2pd %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %ext = shufflevector <4 x i32> %arg0, <4 x i32> %arg0, <2 x i32> <i32 0, i32 1>
+ %res = sitofp <2 x i32> %ext to <2 x double>
+ ret <2 x double> %res
+}
+
+define <4 x float> @test_mm_cvtepi32_ps(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_cvtepi32_ps:
+; X32: # BB#0:
+; X32-NEXT: cvtdq2ps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepi32_ps:
+; X64: # BB#0:
+; X64-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %arg0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_cvtpd_epi32(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_cvtpd_epi32:
+; X32: # BB#0:
+; X32-NEXT: cvtpd2dq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtpd_epi32:
+; X64: # BB#0:
+; X64-NEXT: cvtpd2dq %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
+
+define <4 x float> @test_mm_cvtpd_ps(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_cvtpd_ps:
+; X32: # BB#0:
+; X32-NEXT: cvtpd2ps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtpd_ps:
+; X64: # BB#0:
+; X64-NEXT: cvtpd2ps %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double>) nounwind readnone
+
+define <2 x i64> @test_mm_cvtps_epi32(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvtps_epi32:
+; X32: # BB#0:
+; X32-NEXT: cvtps2dq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtps_epi32:
+; X64: # BB#0:
+; X64-NEXT: cvtps2dq %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
+
+define <2 x double> @test_mm_cvtps_pd(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvtps_pd:
+; X32: # BB#0:
+; X32-NEXT: cvtps2pd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtps_pd:
+; X64: # BB#0:
+; X64-NEXT: cvtps2pd %xmm0, %xmm0
+; X64-NEXT: retq
+ %ext = shufflevector <4 x float> %a0, <4 x float> %a0, <2 x i32> <i32 0, i32 1>
+ %res = fpext <2 x float> %ext to <2 x double>
+ ret <2 x double> %res
+}
+
+define double @test_mm_cvtsd_f64(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_cvtsd_f64:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-8, %esp
+; X32-NEXT: subl $8, %esp
+; X32-NEXT: movlps %xmm0, (%esp)
+; X32-NEXT: fldl (%esp)
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtsd_f64:
+; X64: # BB#0:
+; X64-NEXT: retq
+ %res = extractelement <2 x double> %a0, i32 0
+ ret double %res
+}
+
+define i32 @test_mm_cvtsd_si32(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_cvtsd_si32:
+; X32: # BB#0:
+; X32-NEXT: cvtsd2si %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtsd_si32:
+; X64: # BB#0:
+; X64-NEXT: cvtsd2si %xmm0, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
+
+define i32 @test_mm_cvtsi128_si32(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_cvtsi128_si32:
+; X32: # BB#0:
+; X32-NEXT: movd %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtsi128_si32:
+; X64: # BB#0:
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = extractelement <4 x i32> %arg0, i32 0
+ ret i32 %res
+}
+
+define <2 x double> @test_mm_cvtsi32_sd(<2 x double> %a0, i32 %a1) nounwind {
+; X32-LABEL: test_mm_cvtsi32_sd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: cvtsi2sdl %eax, %xmm1
+; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtsi32_sd:
+; X64: # BB#0:
+; X64-NEXT: cvtsi2sdl %edi, %xmm1
+; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: retq
+ %cvt = sitofp i32 %a1 to double
+ %res = insertelement <2 x double> %a0, double %cvt, i32 0
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_cvtsi32_si128(i32 %a0) nounwind {
+; X32-LABEL: test_mm_cvtsi32_si128:
+; X32: # BB#0:
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtsi32_si128:
+; X64: # BB#0:
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: retq
+ %res0 = insertelement <4 x i32> undef, i32 %a0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 0, i32 1
+ %res2 = insertelement <4 x i32> %res1, i32 0, i32 2
+ %res3 = insertelement <4 x i32> %res2, i32 0, i32 3
+ %res = bitcast <4 x i32> %res3 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_cvtss_sd(<2 x double> %a0, <4 x float> %a1) nounwind {
+; X32-LABEL: test_mm_cvtss_sd:
+; X32: # BB#0:
+; X32-NEXT: cvtss2sd %xmm1, %xmm1
+; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtss_sd:
+; X64: # BB#0:
+; X64-NEXT: cvtss2sd %xmm1, %xmm1
+; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: retq
+ %ext = extractelement <4 x float> %a1, i32 0
+ %cvt = fpext float %ext to double
+ %res = insertelement <2 x double> %a0, double %cvt, i32 0
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_cvttpd_epi32(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_cvttpd_epi32:
+; X32: # BB#0:
+; X32-NEXT: cvttpd2dq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvttpd_epi32:
+; X64: # BB#0:
+; X64-NEXT: cvttpd2dq %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
+
+define <2 x i64> @test_mm_cvttps_epi32(<4 x float> %a0) nounwind {
+; X32-LABEL: test_mm_cvttps_epi32:
+; X32: # BB#0:
+; X32-NEXT: cvttps2dq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvttps_epi32:
+; X64: # BB#0:
+; X64-NEXT: cvttps2dq %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = fptosi <4 x float> %a0 to <4 x i32>
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define i32 @test_mm_cvttsd_si32(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_cvttsd_si32:
+; X32: # BB#0:
+; X32-NEXT: cvttsd2si %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvttsd_si32:
+; X64: # BB#0:
+; X64-NEXT: cvttsd2si %xmm0, %eax
+; X64-NEXT: retq
+ %ext = extractelement <2 x double> %a0, i32 0
+ %res = fptosi double %ext to i32
+ ret i32 %res
+}
+
+define <2 x double> @test_mm_div_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_div_pd:
+; X32: # BB#0:
+; X32-NEXT: divpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_div_pd:
+; X64: # BB#0:
+; X64-NEXT: divpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = fdiv <2 x double> %a0, %a1
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_div_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_div_sd:
+; X32: # BB#0:
+; X32-NEXT: divsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_div_sd:
+; X64: # BB#0:
+; X64-NEXT: divsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %ext0 = extractelement <2 x double> %a0, i32 0
+ %ext1 = extractelement <2 x double> %a1, i32 0
+ %fdiv = fdiv double %ext0, %ext1
+ %res = insertelement <2 x double> %a0, double %fdiv, i32 0
+ ret <2 x double> %res
+}
+
+define i32 @test_mm_extract_epi16(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_extract_epi16:
+; X32: # BB#0:
+; X32-NEXT: pextrw $1, %xmm0, %eax
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_extract_epi16:
+; X64: # BB#0:
+; X64-NEXT: pextrw $1, %xmm0, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %ext = extractelement <8 x i16> %arg0, i32 1
+ %res = zext i16 %ext to i32
+ ret i32 %res
+}
+
+define <2 x i64> @test_mm_insert_epi16(<2 x i64> %a0, i16 %a1) nounwind {
+; X32-LABEL: test_mm_insert_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: pinsrw $1, %eax, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_insert_epi16:
+; X64: # BB#0:
+; X64-NEXT: pinsrw $1, %edi, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = insertelement <8 x i16> %arg0, i16 %a1, i32 1
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define void @test_mm_lfence() nounwind {
+; X32-LABEL: test_mm_lfence:
+; X32: # BB#0:
+; X32-NEXT: lfence
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_lfence:
+; X64: # BB#0:
+; X64-NEXT: lfence
+; X64-NEXT: retq
+ call void @llvm.x86.sse2.lfence()
+ ret void
+}
+declare void @llvm.x86.sse2.lfence() nounwind readnone
+
+define <2 x double> @test_mm_load_pd(double* %a0) nounwind {
+; X32-LABEL: test_mm_load_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_load_pd:
+; X64: # BB#0:
+; X64-NEXT: movaps (%rdi), %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <2 x double>*
+ %res = load <2 x double>, <2 x double>* %arg0, align 16
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_load_sd(double* %a0) nounwind {
+; X32-LABEL: test_mm_load_sd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_load_sd:
+; X64: # BB#0:
+; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: retq
+ %ld = load double, double* %a0, align 1
+ %res0 = insertelement <2 x double> undef, double %ld, i32 0
+ %res1 = insertelement <2 x double> %res0, double 0.0, i32 1
+ ret <2 x double> %res1
+}
+
+define <2 x i64> @test_mm_load_si128(<2 x i64>* %a0) nounwind {
+; X32-LABEL: test_mm_load_si128:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_load_si128:
+; X64: # BB#0:
+; X64-NEXT: movaps (%rdi), %xmm0
+; X64-NEXT: retq
+ %res = load <2 x i64>, <2 x i64>* %a0, align 16
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_load1_pd(double* %a0) nounwind {
+; X32-LABEL: test_mm_load1_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_load1_pd:
+; X64: # BB#0:
+; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: retq
+ %ld = load double, double* %a0, align 8
+ %res0 = insertelement <2 x double> undef, double %ld, i32 0
+ %res1 = insertelement <2 x double> %res0, double %ld, i32 1
+ ret <2 x double> %res1
+}
+
+define <2 x double> @test_mm_loadh_pd(<2 x double> %a0, double* %a1) nounwind {
+; X32-LABEL: test_mm_loadh_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadh_pd:
+; X64: # BB#0:
+; X64-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X64-NEXT: retq
+ %ld = load double, double* %a1, align 8
+ %res = insertelement <2 x double> %a0, double %ld, i32 1
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_loadl_epi64(<2 x i64> %a0, <2 x i64>* %a1) nounwind {
+; X32-LABEL: test_mm_loadl_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadl_epi64:
+; X64: # BB#0:
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: retq
+ %bc = bitcast <2 x i64>* %a1 to i64*
+ %ld = load i64, i64* %bc, align 1
+ %res0 = insertelement <2 x i64> undef, i64 %ld, i32 0
+ %res1 = insertelement <2 x i64> %res0, i64 0, i32 1
+ ret <2 x i64> %res1
+}
+
+define <2 x double> @test_mm_loadl_pd(<2 x double> %a0, double* %a1) nounwind {
+; X32-LABEL: test_mm_loadl_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadl_pd:
+; X64: # BB#0:
+; X64-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; X64-NEXT: retq
+ %ld = load double, double* %a1, align 8
+ %res = insertelement <2 x double> %a0, double %ld, i32 0
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_loadr_pd(double* %a0) nounwind {
+; X32-LABEL: test_mm_loadr_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movapd (%eax), %xmm0
+; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadr_pd:
+; X64: # BB#0:
+; X64-NEXT: movapd (%rdi), %xmm0
+; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <2 x double>*
+ %ld = load <2 x double>, <2 x double>* %arg0, align 16
+ %res = shufflevector <2 x double> %ld, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_loadu_pd(double* %a0) nounwind {
+; X32-LABEL: test_mm_loadu_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movups (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadu_pd:
+; X64: # BB#0:
+; X64-NEXT: movups (%rdi), %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <2 x double>*
+ %res = load <2 x double>, <2 x double>* %arg0, align 1
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_loadu_si128(<2 x i64>* %a0) nounwind {
+; X32-LABEL: test_mm_loadu_si128:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movups (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_loadu_si128:
+; X64: # BB#0:
+; X64-NEXT: movups (%rdi), %xmm0
+; X64-NEXT: retq
+ %res = load <2 x i64>, <2 x i64>* %a0, align 1
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_madd_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_madd_epi16:
+; X32: # BB#0:
+; X32-NEXT: pmaddwd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_madd_epi16:
+; X64: # BB#0:
+; X64-NEXT: pmaddwd %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
+
+define void @test_mm_maskmoveu_si128(<2 x i64> %a0, <2 x i64> %a1, i8* %a2) nounwind {
+; X32-LABEL: test_mm_maskmoveu_si128:
+; X32: # BB#0:
+; X32-NEXT: pushl %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: maskmovdqu %xmm1, %xmm0
+; X32-NEXT: popl %edi
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskmoveu_si128:
+; X64: # BB#0:
+; X64-NEXT: maskmovdqu %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %arg0, <16 x i8> %arg1, i8* %a2)
+ ret void
+}
+declare void @llvm.x86.sse2.maskmov.dqu(<16 x i8>, <16 x i8>, i8*) nounwind
+
+define <2 x i64> @test_mm_max_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_max_epi16:
+; X32: # BB#0:
+; X32-NEXT: pmaxsw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_epi16:
+; X64: # BB#0:
+; X64-NEXT: pmaxsw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %cmp = icmp sgt <8 x i16> %arg0, %arg1
+ %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1
+ %bc = bitcast <8 x i16> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_max_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_max_epu8:
+; X32: # BB#0:
+; X32-NEXT: pmaxub %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_epu8:
+; X64: # BB#0:
+; X64-NEXT: pmaxub %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %cmp = icmp ugt <16 x i8> %arg0, %arg1
+ %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1
+ %bc = bitcast <16 x i8> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_max_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_max_pd:
+; X32: # BB#0:
+; X32-NEXT: maxpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_pd:
+; X64: # BB#0:
+; X64-NEXT: maxpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_mm_max_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_max_sd:
+; X32: # BB#0:
+; X32-NEXT: maxsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_sd:
+; X64: # BB#0:
+; X64-NEXT: maxsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define void @test_mm_mfence() nounwind {
+; X32-LABEL: test_mm_mfence:
+; X32: # BB#0:
+; X32-NEXT: mfence
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mfence:
+; X64: # BB#0:
+; X64-NEXT: mfence
+; X64-NEXT: retq
+ call void @llvm.x86.sse2.mfence()
+ ret void
+}
+declare void @llvm.x86.sse2.mfence() nounwind readnone
+
+define <2 x i64> @test_mm_min_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_min_epi16:
+; X32: # BB#0:
+; X32-NEXT: pminsw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_epi16:
+; X64: # BB#0:
+; X64-NEXT: pminsw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %cmp = icmp slt <8 x i16> %arg0, %arg1
+ %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1
+ %bc = bitcast <8 x i16> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_min_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_min_epu8:
+; X32: # BB#0:
+; X32-NEXT: pminub %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_epu8:
+; X64: # BB#0:
+; X64-NEXT: pminub %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %cmp = icmp ult <16 x i8> %arg0, %arg1
+ %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1
+ %bc = bitcast <16 x i8> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_min_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_min_pd:
+; X32: # BB#0:
+; X32-NEXT: minpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_pd:
+; X64: # BB#0:
+; X64-NEXT: minpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_mm_min_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_min_sd:
+; X32: # BB#0:
+; X32-NEXT: minsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_sd:
+; X64: # BB#0:
+; X64-NEXT: minsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x i64> @test_mm_move_epi64(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_move_epi64:
+; X32: # BB#0:
+; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_move_epi64:
+; X64: # BB#0:
+; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X64-NEXT: retq
+ %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_move_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_move_sd:
+; X32: # BB#0:
+; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_move_sd:
+; X64: # BB#0:
+; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: retq
+ %ext0 = extractelement <2 x double> %a1, i32 0
+ %res0 = insertelement <2 x double> undef, double %ext0, i32 0
+ %ext1 = extractelement <2 x double> %a0, i32 1
+ %res1 = insertelement <2 x double> %res0, double %ext1, i32 1
+ ret <2 x double> %res1
+}
+
+define i32 @test_mm_movemask_epi8(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_movemask_epi8:
+; X32: # BB#0:
+; X32-NEXT: pmovmskb %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_movemask_epi8:
+; X64: # BB#0:
+; X64-NEXT: pmovmskb %xmm0, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %arg0)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
+
+define i32 @test_mm_movemask_pd(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_movemask_pd:
+; X32: # BB#0:
+; X32-NEXT: movmskpd %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_movemask_pd:
+; X64: # BB#0:
+; X64-NEXT: movmskpd %xmm0, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
+
+define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_mul_epu32:
+; X32: # BB#0:
+; X32-NEXT: pmuludq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mul_epu32:
+; X64: # BB#0:
+; X64-NEXT: pmuludq %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %arg0, <4 x i32> %arg1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x double> @test_mm_mul_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_mul_pd:
+; X32: # BB#0:
+; X32-NEXT: mulpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mul_pd:
+; X64: # BB#0:
+; X64-NEXT: mulpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = fmul <2 x double> %a0, %a1
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_mul_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_mul_sd:
+; X32: # BB#0:
+; X32-NEXT: mulsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mul_sd:
+; X64: # BB#0:
+; X64-NEXT: mulsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %ext0 = extractelement <2 x double> %a0, i32 0
+ %ext1 = extractelement <2 x double> %a1, i32 0
+ %fmul = fmul double %ext0, %ext1
+ %res = insertelement <2 x double> %a0, double %fmul, i32 0
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_mulhi_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_mulhi_epi16:
+; X32: # BB#0:
+; X32-NEXT: pmulhw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mulhi_epi16:
+; X64: # BB#0:
+; X64-NEXT: pmulhw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_mulhi_epu16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_mulhi_epu16:
+; X32: # BB#0:
+; X32-NEXT: pmulhuw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mulhi_epu16:
+; X64: # BB#0:
+; X64-NEXT: pmulhuw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_mullo_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_mullo_epi16:
+; X32: # BB#0:
+; X32-NEXT: pmullw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mullo_epi16:
+; X64: # BB#0:
+; X64-NEXT: pmullw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = mul <8 x i16> %arg0, %arg1
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_or_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_or_pd:
+; X32: # BB#0:
+; X32-NEXT: orps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_or_pd:
+; X64: # BB#0:
+; X64-NEXT: orps %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x double> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x double> %a1 to <4 x i32>
+ %res = or <4 x i32> %arg0, %arg1
+ %bc = bitcast <4 x i32> %res to <2 x double>
+ ret <2 x double> %bc
+}
+
+define <2 x i64> @test_mm_or_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_or_si128:
+; X32: # BB#0:
+; X32-NEXT: orps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_or_si128:
+; X64: # BB#0:
+; X64-NEXT: orps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = or <2 x i64> %a0, %a1
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_packs_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_packs_epi16:
+; X32: # BB#0:
+; X32-NEXT: packsswb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_packs_epi16:
+; X64: # BB#0:
+; X64-NEXT: packsswb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_packs_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_packs_epi32:
+; X32: # BB#0:
+; X32-NEXT: packssdw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_packs_epi32:
+; X64: # BB#0:
+; X64-NEXT: packssdw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_packus_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_packus_epi16:
+; X32: # BB#0:
+; X32-NEXT: packuswb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_packus_epi16:
+; X64: # BB#0:
+; X64-NEXT: packuswb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define void @test_mm_pause() nounwind {
+; X32-LABEL: test_mm_pause:
+; X32: # BB#0:
+; X32-NEXT: pause
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_pause:
+; X64: # BB#0:
+; X64-NEXT: pause
+; X64-NEXT: retq
+ call void @llvm.x86.sse2.pause()
+ ret void
+}
+declare void @llvm.x86.sse2.pause() nounwind readnone
+
+define <2 x i64> @test_mm_sad_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_sad_epu8:
+; X32: # BB#0:
+; X32-NEXT: psadbw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sad_epu8:
+; X64: # BB#0:
+; X64-NEXT: psadbw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %arg0, <16 x i8> %arg1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15) nounwind {
+; X32-LABEL: test_mm_set_epi8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm2
+; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm3
+; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm2
+; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm3
+; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm2
+; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm4
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set_epi8:
+; X64: # BB#0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X64-NEXT: movzbl %dl, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm3
+; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X64-NEXT: movzbl %r9b, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm3
+; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm4
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT: retq
+ %res0 = insertelement <16 x i8> undef, i8 %a15, i32 0
+ %res1 = insertelement <16 x i8> %res0, i8 %a14, i32 1
+ %res2 = insertelement <16 x i8> %res1, i8 %a13, i32 2
+ %res3 = insertelement <16 x i8> %res2, i8 %a12, i32 3
+ %res4 = insertelement <16 x i8> %res3, i8 %a11, i32 4
+ %res5 = insertelement <16 x i8> %res4, i8 %a10, i32 5
+ %res6 = insertelement <16 x i8> %res5, i8 %a9 , i32 6
+ %res7 = insertelement <16 x i8> %res6, i8 %a8 , i32 7
+ %res8 = insertelement <16 x i8> %res7, i8 %a7 , i32 8
+ %res9 = insertelement <16 x i8> %res8, i8 %a6 , i32 9
+ %res10 = insertelement <16 x i8> %res9, i8 %a5 , i32 10
+ %res11 = insertelement <16 x i8> %res10, i8 %a4 , i32 11
+ %res12 = insertelement <16 x i8> %res11, i8 %a3 , i32 12
+ %res13 = insertelement <16 x i8> %res12, i8 %a2 , i32 13
+ %res14 = insertelement <16 x i8> %res13, i8 %a1 , i32 14
+ %res15 = insertelement <16 x i8> %res14, i8 %a0 , i32 15
+ %res = bitcast <16 x i8> %res15 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
+; X32-LABEL: test_mm_set_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm2
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm3
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm4
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm5
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm6
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm7
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set_epi16:
+; X64: # BB#0:
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %r10w
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: movd %r8d, %xmm1
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT: movd %edx, %xmm0
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X64-NEXT: movd %esi, %xmm0
+; X64-NEXT: movd %r9d, %xmm1
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT: movd %ecx, %xmm3
+; X64-NEXT: movd %r10d, %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT: retq
+ %res0 = insertelement <8 x i16> undef, i16 %a7, i32 0
+ %res1 = insertelement <8 x i16> %res0, i16 %a6, i32 1
+ %res2 = insertelement <8 x i16> %res1, i16 %a5, i32 2
+ %res3 = insertelement <8 x i16> %res2, i16 %a4, i32 3
+ %res4 = insertelement <8 x i16> %res3, i16 %a3, i32 4
+ %res5 = insertelement <8 x i16> %res4, i16 %a2, i32 5
+ %res6 = insertelement <8 x i16> %res5, i16 %a1, i32 6
+ %res7 = insertelement <8 x i16> %res6, i16 %a0, i32 7
+ %res = bitcast <8 x i16> %res7 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind {
+; X32-LABEL: test_mm_set_epi32:
+; X32: # BB#0:
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set_epi32:
+; X64: # BB#0:
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: movd %edx, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: movd %esi, %xmm2
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %res0 = insertelement <4 x i32> undef, i32 %a3, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 %a2, i32 1
+ %res2 = insertelement <4 x i32> %res1, i32 %a1, i32 2
+ %res3 = insertelement <4 x i32> %res2, i32 %a0, i32 3
+ %res = bitcast <4 x i32> %res3 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+; TODO test_mm_set_epi64
+
+define <2 x i64> @test_mm_set_epi64x(i64 %a0, i64 %a1) nounwind {
+; X32-LABEL: test_mm_set_epi64x:
+; X32: # BB#0:
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set_epi64x:
+; X64: # BB#0:
+; X64-NEXT: movd %rdi, %xmm1
+; X64-NEXT: movd %rsi, %xmm0
+; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: retq
+ %res0 = insertelement <2 x i64> undef, i64 %a1, i32 0
+ %res1 = insertelement <2 x i64> %res0, i64 %a0, i32 1
+ ret <2 x i64> %res1
+}
+
+define <2 x double> @test_mm_set_pd(double %a0, double %a1) nounwind {
+; X32-LABEL: test_mm_set_pd:
+; X32: # BB#0:
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set_pd:
+; X64: # BB#0:
+; X64-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-NEXT: movapd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res0 = insertelement <2 x double> undef, double %a1, i32 0
+ %res1 = insertelement <2 x double> %res0, double %a0, i32 1
+ ret <2 x double> %res1
+}
+
+define <2 x double> @test_mm_set_sd(double %a0) nounwind {
+; X32-LABEL: test_mm_set_sd:
+; X32: # BB#0:
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set_sd:
+; X64: # BB#0:
+; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; X64-NEXT: retq
+ %res0 = insertelement <2 x double> undef, double %a0, i32 0
+ %res1 = insertelement <2 x double> %res0, double 0.0, i32 1
+ ret <2 x double> %res1
+}
+
+define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind {
+; X32-LABEL: test_mm_set1_epi8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set1_epi8:
+; X64: # BB#0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X64-NEXT: retq
+ %res0 = insertelement <16 x i8> undef, i8 %a0, i32 0
+ %res1 = insertelement <16 x i8> %res0, i8 %a0, i32 1
+ %res2 = insertelement <16 x i8> %res1, i8 %a0, i32 2
+ %res3 = insertelement <16 x i8> %res2, i8 %a0, i32 3
+ %res4 = insertelement <16 x i8> %res3, i8 %a0, i32 4
+ %res5 = insertelement <16 x i8> %res4, i8 %a0, i32 5
+ %res6 = insertelement <16 x i8> %res5, i8 %a0, i32 6
+ %res7 = insertelement <16 x i8> %res6, i8 %a0, i32 7
+ %res8 = insertelement <16 x i8> %res7, i8 %a0, i32 8
+ %res9 = insertelement <16 x i8> %res8, i8 %a0, i32 9
+ %res10 = insertelement <16 x i8> %res9, i8 %a0, i32 10
+ %res11 = insertelement <16 x i8> %res10, i8 %a0, i32 11
+ %res12 = insertelement <16 x i8> %res11, i8 %a0, i32 12
+ %res13 = insertelement <16 x i8> %res12, i8 %a0, i32 13
+ %res14 = insertelement <16 x i8> %res13, i8 %a0, i32 14
+ %res15 = insertelement <16 x i8> %res14, i8 %a0, i32 15
+ %res = bitcast <16 x i8> %res15 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind {
+; X32-LABEL: test_mm_set1_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set1_epi16:
+; X64: # BB#0:
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X64-NEXT: retq
+ %res0 = insertelement <8 x i16> undef, i16 %a0, i32 0
+ %res1 = insertelement <8 x i16> %res0, i16 %a0, i32 1
+ %res2 = insertelement <8 x i16> %res1, i16 %a0, i32 2
+ %res3 = insertelement <8 x i16> %res2, i16 %a0, i32 3
+ %res4 = insertelement <8 x i16> %res3, i16 %a0, i32 4
+ %res5 = insertelement <8 x i16> %res4, i16 %a0, i32 5
+ %res6 = insertelement <8 x i16> %res5, i16 %a0, i32 6
+ %res7 = insertelement <8 x i16> %res6, i16 %a0, i32 7
+ %res = bitcast <8 x i16> %res7 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_set1_epi32(i32 %a0) nounwind {
+; X32-LABEL: test_mm_set1_epi32:
+; X32: # BB#0:
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set1_epi32:
+; X64: # BB#0:
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: retq
+ %res0 = insertelement <4 x i32> undef, i32 %a0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 %a0, i32 1
+ %res2 = insertelement <4 x i32> %res1, i32 %a0, i32 2
+ %res3 = insertelement <4 x i32> %res2, i32 %a0, i32 3
+ %res = bitcast <4 x i32> %res3 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+; TODO test_mm_set1_epi64
+
+define <2 x i64> @test_mm_set1_epi64x(i64 %a0) nounwind {
+; X32-LABEL: test_mm_set1_epi64x:
+; X32: # BB#0:
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set1_epi64x:
+; X64: # BB#0:
+; X64-NEXT: movd %rdi, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X64-NEXT: retq
+ %res0 = insertelement <2 x i64> undef, i64 %a0, i32 0
+ %res1 = insertelement <2 x i64> %res0, i64 %a0, i32 1
+ ret <2 x i64> %res1
+}
+
+define <2 x double> @test_mm_set1_pd(double %a0) nounwind {
+; X32-LABEL: test_mm_set1_pd:
+; X32: # BB#0:
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_set1_pd:
+; X64: # BB#0:
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: retq
+ %res0 = insertelement <2 x double> undef, double %a0, i32 0
+ %res1 = insertelement <2 x double> %res0, double %a0, i32 1
+ ret <2 x double> %res1
+}
+
+define <2 x i64> @test_mm_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15) nounwind {
+; X32-LABEL: test_mm_setr_epi8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm2
+; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm3
+; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm2
+; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm3
+; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm2
+; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm4
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setr_epi8:
+; X64: # BB#0:
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl %r9b, %eax
+; X64-NEXT: movd %eax, %xmm3
+; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl %sil, %eax
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl %dl, %eax
+; X64-NEXT: movd %eax, %xmm3
+; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movzbl %r8b, %eax
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movd %eax, %xmm4
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT: retq
+ %res0 = insertelement <16 x i8> undef, i8 %a0 , i32 0
+ %res1 = insertelement <16 x i8> %res0, i8 %a1 , i32 1
+ %res2 = insertelement <16 x i8> %res1, i8 %a2 , i32 2
+ %res3 = insertelement <16 x i8> %res2, i8 %a3 , i32 3
+ %res4 = insertelement <16 x i8> %res3, i8 %a4 , i32 4
+ %res5 = insertelement <16 x i8> %res4, i8 %a5 , i32 5
+ %res6 = insertelement <16 x i8> %res5, i8 %a6 , i32 6
+ %res7 = insertelement <16 x i8> %res6, i8 %a7 , i32 7
+ %res8 = insertelement <16 x i8> %res7, i8 %a8 , i32 8
+ %res9 = insertelement <16 x i8> %res8, i8 %a9 , i32 9
+ %res10 = insertelement <16 x i8> %res9, i8 %a10, i32 10
+ %res11 = insertelement <16 x i8> %res10, i8 %a11, i32 11
+ %res12 = insertelement <16 x i8> %res11, i8 %a12, i32 12
+ %res13 = insertelement <16 x i8> %res12, i8 %a13, i32 13
+ %res14 = insertelement <16 x i8> %res13, i8 %a14, i32 14
+ %res15 = insertelement <16 x i8> %res14, i8 %a15, i32 15
+ %res = bitcast <16 x i8> %res15 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
+; X32-LABEL: test_mm_setr_epi16:
+; X32: # BB#0:
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm2
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm3
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm4
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm5
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm6
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm7
+; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movd %eax, %xmm0
+; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setr_epi16:
+; X64: # BB#0:
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movw {{[0-9]+}}(%rsp), %r10w
+; X64-NEXT: movd %eax, %xmm0
+; X64-NEXT: movd %ecx, %xmm1
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT: movd %r9d, %xmm0
+; X64-NEXT: movd %esi, %xmm2
+; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X64-NEXT: movd %r10d, %xmm0
+; X64-NEXT: movd %edx, %xmm1
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT: movd %r8d, %xmm3
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT: retq
+ %res0 = insertelement <8 x i16> undef, i16 %a0, i32 0
+ %res1 = insertelement <8 x i16> %res0, i16 %a1, i32 1
+ %res2 = insertelement <8 x i16> %res1, i16 %a2, i32 2
+ %res3 = insertelement <8 x i16> %res2, i16 %a3, i32 3
+ %res4 = insertelement <8 x i16> %res3, i16 %a4, i32 4
+ %res5 = insertelement <8 x i16> %res4, i16 %a5, i32 5
+ %res6 = insertelement <8 x i16> %res5, i16 %a6, i32 6
+ %res7 = insertelement <8 x i16> %res6, i16 %a7, i32 7
+ %res = bitcast <8 x i16> %res7 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind {
+; X32-LABEL: test_mm_setr_epi32:
+; X32: # BB#0:
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setr_epi32:
+; X64: # BB#0:
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: movd %esi, %xmm1
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: movd %edx, %xmm2
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %res0 = insertelement <4 x i32> undef, i32 %a0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 %a1, i32 1
+ %res2 = insertelement <4 x i32> %res1, i32 %a2, i32 2
+ %res3 = insertelement <4 x i32> %res2, i32 %a3, i32 3
+ %res = bitcast <4 x i32> %res3 to <2 x i64>
+ ret <2 x i64> %res
+}
+
+; TODO test_mm_setr_epi64
+
+define <2 x i64> @test_mm_setr_epi64x(i64 %a0, i64 %a1) nounwind {
+; X32-LABEL: test_mm_setr_epi64x:
+; X32: # BB#0:
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setr_epi64x:
+; X64: # BB#0:
+; X64-NEXT: movd %rsi, %xmm1
+; X64-NEXT: movd %rdi, %xmm0
+; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: retq
+ %res0 = insertelement <2 x i64> undef, i64 %a0, i32 0
+ %res1 = insertelement <2 x i64> %res0, i64 %a1, i32 1
+ ret <2 x i64> %res1
+}
+
+define <2 x double> @test_mm_setr_pd(double %a0, double %a1) nounwind {
+; X32-LABEL: test_mm_setr_pd:
+; X32: # BB#0:
+; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setr_pd:
+; X64: # BB#0:
+; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: retq
+ %res0 = insertelement <2 x double> undef, double %a0, i32 0
+ %res1 = insertelement <2 x double> %res0, double %a1, i32 1
+ ret <2 x double> %res1
+}
+
+define <2 x double> @test_mm_setzero_pd() {
+; X32-LABEL: test_mm_setzero_pd:
+; X32: # BB#0:
+; X32-NEXT: xorps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setzero_pd:
+; X64: # BB#0:
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: retq
+ ret <2 x double> zeroinitializer
+}
+
+define <2 x i64> @test_mm_setzero_si128() {
+; X32-LABEL: test_mm_setzero_si128:
+; X32: # BB#0:
+; X32-NEXT: xorps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_setzero_si128:
+; X64: # BB#0:
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: retq
+ ret <2 x i64> zeroinitializer
+}
+
+define <2 x i64> @test_mm_shuffle_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_shuffle_epi32:
+; X32: # BB#0:
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shuffle_epi32:
+; X64: # BB#0:
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_shuffle_pd:
+; X32: # BB#0:
+; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shuffle_pd:
+; X64: # BB#0:
+; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_shufflehi_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_shufflehi_epi16:
+; X32: # BB#0:
+; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shufflehi_epi16:
+; X64: # BB#0:
+; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_shufflelo_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_shufflelo_epi16:
+; X32: # BB#0:
+; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shufflelo_epi16:
+; X64: # BB#0:
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_sll_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sll_epi16:
+; X32: # BB#0:
+; X32-NEXT: psllw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sll_epi16:
+; X64: # BB#0:
+; X64-NEXT: psllw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_sll_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sll_epi32:
+; X32: # BB#0:
+; X32-NEXT: pslld %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sll_epi32:
+; X64: # BB#0:
+; X64-NEXT: pslld %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_sll_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sll_epi64:
+; X32: # BB#0:
+; X32-NEXT: psllq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sll_epi64:
+; X64: # BB#0:
+; X64-NEXT: psllq %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_slli_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_slli_epi16:
+; X32: # BB#0:
+; X32-NEXT: psllw $1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_slli_epi16:
+; X64: # BB#0:
+; X64-NEXT: psllw $1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %arg0, i32 1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_slli_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_slli_epi32:
+; X32: # BB#0:
+; X32-NEXT: pslld $1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_slli_epi32:
+; X64: # BB#0:
+; X64-NEXT: pslld $1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %arg0, i32 1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_slli_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_slli_epi64:
+; X32: # BB#0:
+; X32-NEXT: psllq $1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_slli_epi64:
+; X64: # BB#0:
+; X64-NEXT: psllq $1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_slli_si128(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_slli_si128:
+; X32: # BB#0:
+; X32-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_slli_si128:
+; X64: # BB#0:
+; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = shufflevector <16 x i8> zeroinitializer, <16 x i8> %arg0, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x double> @test_mm_sqrt_pd(<2 x double> %a0) nounwind {
+; X32-LABEL: test_mm_sqrt_pd:
+; X32: # BB#0:
+; X32-NEXT: sqrtpd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sqrt_pd:
+; X64: # BB#0:
+; X64-NEXT: sqrtpd %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+
+define <2 x double> @test_mm_sqrt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_sqrt_sd:
+; X32: # BB#0:
+; X32-NEXT: sqrtsd %xmm0, %xmm1
+; X32-NEXT: movaps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sqrt_sd:
+; X64: # BB#0:
+; X64-NEXT: sqrtsd %xmm0, %xmm1
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+ %call = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0)
+ %ext0 = extractelement <2 x double> %call, i32 0
+ %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
+ %ext1 = extractelement <2 x double> %a1, i32 1
+ %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
+ ret <2 x double> %ins1
+}
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+define <2 x i64> @test_mm_sra_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sra_epi16:
+; X32: # BB#0:
+; X32-NEXT: psraw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sra_epi16:
+; X64: # BB#0:
+; X64-NEXT: psraw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_sra_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sra_epi32:
+; X32: # BB#0:
+; X32-NEXT: psrad %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sra_epi32:
+; X64: # BB#0:
+; X64-NEXT: psrad %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_srai_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_srai_epi16:
+; X32: # BB#0:
+; X32-NEXT: psraw $1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srai_epi16:
+; X64: # BB#0:
+; X64-NEXT: psraw $1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %arg0, i32 1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_srai_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_srai_epi32:
+; X32: # BB#0:
+; X32-NEXT: psrad $1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srai_epi32:
+; X64: # BB#0:
+; X64-NEXT: psrad $1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %arg0, i32 1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_srl_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_srl_epi16:
+; X32: # BB#0:
+; X32-NEXT: psrlw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srl_epi16:
+; X64: # BB#0:
+; X64-NEXT: psrlw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_srl_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_srl_epi32:
+; X32: # BB#0:
+; X32-NEXT: psrld %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srl_epi32:
+; X64: # BB#0:
+; X64-NEXT: psrld %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_srl_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_srl_epi64:
+; X32: # BB#0:
+; X32-NEXT: psrlq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srl_epi64:
+; X64: # BB#0:
+; X64-NEXT: psrlq %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_srli_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_srli_epi16:
+; X32: # BB#0:
+; X32-NEXT: psrlw $1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srli_epi16:
+; X64: # BB#0:
+; X64-NEXT: psrlw $1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %arg0, i32 1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_srli_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_srli_epi32:
+; X32: # BB#0:
+; X32-NEXT: psrld $1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srli_epi32:
+; X64: # BB#0:
+; X64-NEXT: psrld $1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %arg0, i32 1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_srli_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_srli_epi64:
+; X32: # BB#0:
+; X32-NEXT: psrlq $1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srli_epi64:
+; X64: # BB#0:
+; X64-NEXT: psrlq $1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %a0, i32 1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_srli_si128(<2 x i64> %a0) nounwind {
+; X32-LABEL: test_mm_srli_si128:
+; X32: # BB#0:
+; X32-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_srli_si128:
+; X64: # BB#0:
+; X64-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = shufflevector <16 x i8> %arg0, <16 x i8> zeroinitializer, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define void @test_mm_store_pd(double *%a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_store_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store_pd:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <2 x double>*
+ store <2 x double> %a1, <2 x double>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_store_pd1(double *%a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_store_pd1:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store_pd1:
+; X64: # BB#0:
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast double * %a0 to <2 x double>*
+ %shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
+ store <2 x double> %shuf, <2 x double>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_store_sd(double *%a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_store_sd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movsd %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store_sd:
+; X64: # BB#0:
+; X64-NEXT: movsd %xmm0, (%rdi)
+; X64-NEXT: retq
+ %ext = extractelement <2 x double> %a1, i32 0
+ store double %ext, double* %a0, align 1
+ ret void
+}
+
+define void @test_mm_store_si128(<2 x i64> *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_store_si128:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store_si128:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ store <2 x i64> %a1, <2 x i64>* %a0, align 16
+ ret void
+}
+
+define void @test_mm_store1_pd(double *%a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_store1_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: movaps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_store1_pd:
+; X64: # BB#0:
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast double * %a0 to <2 x double>*
+ %shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
+ store <2 x double> %shuf, <2 x double>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_storeh_sd(double *%a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_storeh_sd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT: movsd %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storeh_sd:
+; X64: # BB#0:
+; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT: movsd %xmm0, (%rdi)
+; X64-NEXT: retq
+ %ext = extractelement <2 x double> %a1, i32 1
+ store double %ext, double* %a0, align 8
+ ret void
+}
+
+define void @test_mm_storel_epi64(<2 x i64> *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_storel_epi64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movlps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storel_epi64:
+; X64: # BB#0:
+; X64-NEXT: movd %xmm0, %rax
+; X64-NEXT: movq %rax, (%rdi)
+; X64-NEXT: retq
+ %ext = extractelement <2 x i64> %a1, i32 0
+ %bc = bitcast <2 x i64> *%a0 to i64*
+ store i64 %ext, i64* %bc, align 8
+ ret void
+}
+
+define void @test_mm_storel_sd(double *%a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_storel_sd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movsd %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storel_sd:
+; X64: # BB#0:
+; X64-NEXT: movsd %xmm0, (%rdi)
+; X64-NEXT: retq
+ %ext = extractelement <2 x double> %a1, i32 0
+ store double %ext, double* %a0, align 8
+ ret void
+}
+
+define void @test_mm_storer_pd(double *%a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_storer_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT: movapd %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storer_pd:
+; X64: # BB#0:
+; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT: movapd %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <2 x double>*
+ %shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ store <2 x double> %shuf, <2 x double>* %arg0, align 16
+ ret void
+}
+
+define void @test_mm_storeu_pd(double *%a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_storeu_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movups %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storeu_pd:
+; X64: # BB#0:
+; X64-NEXT: movups %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <2 x double>*
+ store <2 x double> %a1, <2 x double>* %arg0, align 1
+ ret void
+}
+
+define void @test_mm_storeu_si128(<2 x i64> *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_storeu_si128:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movups %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_storeu_si128:
+; X64: # BB#0:
+; X64-NEXT: movups %xmm0, (%rdi)
+; X64-NEXT: retq
+ store <2 x i64> %a1, <2 x i64>* %a0, align 1
+ ret void
+}
+
+define void @test_mm_stream_pd(double *%a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_stream_pd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movntps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_stream_pd:
+; X64: # BB#0:
+; X64-NEXT: movntps %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arg0 = bitcast double* %a0 to <2 x double>*
+ store <2 x double> %a1, <2 x double>* %arg0, align 16, !nontemporal !0
+ ret void
+}
+
+define void @test_mm_stream_si32(i32 *%a0, i32 %a1) {
+; X32-LABEL: test_mm_stream_si32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movntil %eax, (%ecx)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_stream_si32:
+; X64: # BB#0:
+; X64-NEXT: movntil %esi, (%rdi)
+; X64-NEXT: retq
+ store i32 %a1, i32* %a0, align 1, !nontemporal !0
+ ret void
+}
+
+define void @test_mm_stream_si128(<2 x i64> *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_stream_si128:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movntps %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_stream_si128:
+; X64: # BB#0:
+; X64-NEXT: movntps %xmm0, (%rdi)
+; X64-NEXT: retq
+ store <2 x i64> %a1, <2 x i64>* %a0, align 16, !nontemporal !0
+ ret void
+}
+
+define <2 x i64> @test_mm_sub_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_sub_epi8:
+; X32: # BB#0:
+; X32-NEXT: psubb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sub_epi8:
+; X64: # BB#0:
+; X64-NEXT: psubb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = sub <16 x i8> %arg0, %arg1
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_sub_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_sub_epi16:
+; X32: # BB#0:
+; X32-NEXT: psubw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sub_epi16:
+; X64: # BB#0:
+; X64-NEXT: psubw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = sub <8 x i16> %arg0, %arg1
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_sub_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_sub_epi32:
+; X32: # BB#0:
+; X32-NEXT: psubd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sub_epi32:
+; X64: # BB#0:
+; X64-NEXT: psubd %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = sub <4 x i32> %arg0, %arg1
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_sub_epi64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_sub_epi64:
+; X32: # BB#0:
+; X32-NEXT: psubq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sub_epi64:
+; X64: # BB#0:
+; X64-NEXT: psubq %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = sub <2 x i64> %a0, %a1
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_sub_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_sub_pd:
+; X32: # BB#0:
+; X32-NEXT: subpd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sub_pd:
+; X64: # BB#0:
+; X64-NEXT: subpd %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = fsub <2 x double> %a0, %a1
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_sub_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_sub_sd:
+; X32: # BB#0:
+; X32-NEXT: subsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sub_sd:
+; X64: # BB#0:
+; X64-NEXT: subsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %ext0 = extractelement <2 x double> %a0, i32 0
+ %ext1 = extractelement <2 x double> %a1, i32 0
+ %fsub = fsub double %ext0, %ext1
+ %res = insertelement <2 x double> %a0, double %fsub, i32 0
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_subs_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_subs_epi8:
+; X32: # BB#0:
+; X32-NEXT: psubsb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_subs_epi8:
+; X64: # BB#0:
+; X64-NEXT: psubsb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_subs_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_subs_epi16:
+; X32: # BB#0:
+; X32-NEXT: psubsw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_subs_epi16:
+; X64: # BB#0:
+; X64-NEXT: psubsw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_subs_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_subs_epu8:
+; X32: # BB#0:
+; X32-NEXT: psubusb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_subs_epu8:
+; X64: # BB#0:
+; X64-NEXT: psubusb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_subs_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_subs_epu16:
+; X32: # BB#0:
+; X32-NEXT: psubusw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_subs_epu16:
+; X64: # BB#0:
+; X64-NEXT: psubusw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define i32 @test_mm_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_ucomieq_sd:
+; X32: # BB#0:
+; X32-NEXT: ucomisd %xmm1, %xmm0
+; X32-NEXT: setnp %al
+; X32-NEXT: sete %cl
+; X32-NEXT: andb %al, %cl
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomieq_sd:
+; X64: # BB#0:
+; X64-NEXT: ucomisd %xmm1, %xmm0
+; X64-NEXT: setnp %al
+; X64-NEXT: sete %cl
+; X64-NEXT: andb %al, %cl
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_ucomige_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_ucomige_sd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ucomisd %xmm1, %xmm0
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomige_sd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ucomisd %xmm1, %xmm0
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomige.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_ucomigt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_ucomigt_sd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ucomisd %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomigt_sd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ucomisd %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.ucomigt.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomigt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_ucomile_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_ucomile_sd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ucomisd %xmm0, %xmm1
+; X32-NEXT: setae %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomile_sd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ucomisd %xmm0, %xmm1
+; X64-NEXT: setae %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomile.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_ucomilt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_ucomilt_sd:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ucomisd %xmm0, %xmm1
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomilt_sd:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ucomisd %xmm0, %xmm1
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.ucomilt.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomilt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @test_mm_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_ucomineq_sd:
+; X32: # BB#0:
+; X32-NEXT: ucomisd %xmm1, %xmm0
+; X32-NEXT: setp %al
+; X32-NEXT: setne %cl
+; X32-NEXT: orb %al, %cl
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ucomineq_sd:
+; X64: # BB#0:
+; X64-NEXT: ucomisd %xmm1, %xmm0
+; X64-NEXT: setp %al
+; X64-NEXT: setne %cl
+; X64-NEXT: orb %al, %cl
+; X64-NEXT: movzbl %cl, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %a0, <2 x double> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomineq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_mm_undefined_pd() {
+; X32-LABEL: test_mm_undefined_pd:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_undefined_pd:
+; X64: # BB#0:
+; X64-NEXT: retq
+ ret <2 x double> undef
+}
+
+define <2 x i64> @test_mm_undefined_si128() {
+; X32-LABEL: test_mm_undefined_si128:
+; X32: # BB#0:
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_undefined_si128:
+; X64: # BB#0:
+; X64-NEXT: retq
+ ret <2 x i64> undef
+}
+
+define <2 x i64> @test_mm_unpackhi_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpackhi_epi8:
+; X32: # BB#0:
+; X32-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpackhi_epi8:
+; X64: # BB#0:
+; X64-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = shufflevector <16 x i8> %arg0, <16 x i8> %arg1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_unpackhi_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpackhi_epi16:
+; X32: # BB#0:
+; X32-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpackhi_epi16:
+; X64: # BB#0:
+; X64-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_unpackhi_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpackhi_epi32:
+; X32: # BB#0:
+; X32-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpackhi_epi32:
+; X64: # BB#0:
+; X64-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = shufflevector <4 x i32> %arg0,<4 x i32> %arg1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_unpackhi_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpackhi_epi64:
+; X32: # BB#0:
+; X32-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpackhi_epi64:
+; X64: # BB#0:
+; X64-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3>
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_unpackhi_pd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_unpackhi_pd:
+; X32: # BB#0:
+; X32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpackhi_pd:
+; X64: # BB#0:
+; X64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
+ ret <2 x double> %res
+}
+
+define <2 x i64> @test_mm_unpacklo_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpacklo_epi8:
+; X32: # BB#0:
+; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpacklo_epi8:
+; X64: # BB#0:
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = shufflevector <16 x i8> %arg0, <16 x i8> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_unpacklo_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpacklo_epi16:
+; X32: # BB#0:
+; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpacklo_epi16:
+; X64: # BB#0:
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_unpacklo_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpacklo_epi32:
+; X32: # BB#0:
+; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpacklo_epi32:
+; X64: # BB#0:
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = shufflevector <4 x i32> %arg0,<4 x i32> %arg1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_unpacklo_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_unpacklo_epi64:
+; X32: # BB#0:
+; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpacklo_epi64:
+; X64: # BB#0:
+; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: retq
+ %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_unpacklo_pd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_unpacklo_pd:
+; X32: # BB#0:
+; X32-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_unpacklo_pd:
+; X64: # BB#0:
+; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_xor_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
+; X32-LABEL: test_mm_xor_pd:
+; X32: # BB#0:
+; X32-NEXT: xorps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_xor_pd:
+; X64: # BB#0:
+; X64-NEXT: xorps %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x double> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x double> %a1 to <4 x i32>
+ %res = xor <4 x i32> %arg0, %arg1
+ %bc = bitcast <4 x i32> %res to <2 x double>
+ ret <2 x double> %bc
+}
+
+define <2 x i64> @test_mm_xor_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; X32-LABEL: test_mm_xor_si128:
+; X32: # BB#0:
+; X32-NEXT: xorps %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_xor_si128:
+; X64: # BB#0:
+; X64-NEXT: xorps %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = xor <2 x i64> %a0, %a1
+ ret <2 x i64> %res
+}
+
+!0 = !{i32 1}
+
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index b0412b96bdb2..ae6626bb0dc5 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -1,7 +1,11 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=pentium4 -mattr=sse2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s
define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
- ; CHECK: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
+; CHECK-LABEL: test_x86_sse2_psll_dq_bs:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -9,14 +13,20 @@ declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
- ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
+; CHECK-LABEL: test_x86_sse2_psrl_dq_bs:
+; CHECK: ## BB#0:
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
- ; CHECK: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; CHECK-LABEL: test_x86_sse2_psll_dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -24,8 +34,166 @@ declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
- ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-LABEL: test_x86_sse2_psrl_dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cvtdq2pd(<4 x i32> %a0) {
+; CHECK-LABEL: test_x86_sse2_cvtdq2pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0
+; CHECK-NEXT: retl
+ %res = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_sse2_cvtps2pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: cvtps2pd %xmm0, %xmm0
+; CHECK-NEXT: retl
+ %res = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_sse2_cvttps2dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: cvttps2dq %xmm0, %xmm0
+; CHECK-NEXT: retl
+ %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
+
+
+define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_x86_sse2_storel_dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movlps %xmm0, (%eax)
+; CHECK-NEXT: retl
+ call void @llvm.x86.sse2.storel.dq(i8* %a0, <4 x i32> %a1)
+ ret void
+}
+declare void @llvm.x86.sse2.storel.dq(i8*, <4 x i32>) nounwind
+
+
+define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
+ ; add operation forces the execution domain.
+; CHECK-LABEL: test_x86_sse2_storeu_dq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: paddb LCPI8_0, %xmm0
+; CHECK-NEXT: movdqu %xmm0, (%eax)
+; CHECK-NEXT: retl
+ %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
+ ret void
+}
+declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
+
+
+define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
+ ; fadd operation forces the execution domain.
+; CHECK-LABEL: test_x86_sse2_storeu_pd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: addpd %xmm0, %xmm1
+; CHECK-NEXT: movupd %xmm1, (%eax)
+; CHECK-NEXT: retl
+ %a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
+ call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
+ ret void
+}
+declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
+
+define <4 x i32> @test_x86_sse2_pshuf_d(<4 x i32> %a) {
+; CHECK-LABEL: test_x86_sse2_pshuf_d:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT: retl
+entry:
+ %res = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) nounwind readnone
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8) nounwind readnone
+
+define <8 x i16> @test_x86_sse2_pshufl_w(<8 x i16> %a) {
+; CHECK-LABEL: test_x86_sse2_pshufl_w:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-NEXT: retl
+entry:
+ %res = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) nounwind readnone
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8) nounwind readnone
+
+define <8 x i16> @test_x86_sse2_pshufh_w(<8 x i16> %a) {
+; CHECK-LABEL: test_x86_sse2_pshufh_w:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-NEXT: retl
+entry:
+ %res = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27) nounwind readnone
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8) nounwind readnone
+
+define <16 x i8> @max_epu8(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: max_epu8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmaxub %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1)
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <16 x i8> @min_epu8(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: min_epu8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pminub %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1)
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @max_epi16(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: max_epi16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmaxsw %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1)
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @min_epi16(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: min_epi16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pminsw %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1)
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone
+
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index 53132a167fb8..617e30e4b92c 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -1,8 +1,17 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse2 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL
define <2 x double> @test_x86_sse2_add_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: addsd
+; SSE-LABEL: test_x86_sse2_add_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: addsd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_add_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -10,7 +19,15 @@ declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_cmp_pd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: cmpordpd
+; SSE-LABEL: test_x86_sse2_cmp_pd:
+; SSE: ## BB#0:
+; SSE-NEXT: cmpordpd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cmp_pd:
+; KNL: ## BB#0:
+; KNL-NEXT: vcmpordpd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -18,7 +35,15 @@ declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounw
define <2 x double> @test_x86_sse2_cmp_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: cmpordsd
+; SSE-LABEL: test_x86_sse2_cmp_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: cmpordsd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cmp_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vcmpordsd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -26,9 +51,23 @@ declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounw
define i32 @test_x86_sse2_comieq_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: comisd
- ; CHECK: sete
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_comieq_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: comisd %xmm1, %xmm0
+; SSE-NEXT: setnp %al
+; SSE-NEXT: sete %cl
+; SSE-NEXT: andb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_comieq_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vcomisd %xmm1, %xmm0
+; KNL-NEXT: setnp %al
+; KNL-NEXT: sete %cl
+; KNL-NEXT: andb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -36,9 +75,19 @@ declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comige_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: comisd
- ; CHECK: setae
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_comige_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comisd %xmm1, %xmm0
+; SSE-NEXT: setae %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_comige_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vcomisd %xmm1, %xmm0
+; KNL-NEXT: setae %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comige.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -46,9 +95,19 @@ declare i32 @llvm.x86.sse2.comige.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comigt_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: comisd
- ; CHECK: seta
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_comigt_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comisd %xmm1, %xmm0
+; SSE-NEXT: seta %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_comigt_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vcomisd %xmm1, %xmm0
+; KNL-NEXT: seta %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comigt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -56,9 +115,19 @@ declare i32 @llvm.x86.sse2.comigt.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comile_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: comisd
- ; CHECK: setbe
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_comile_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comisd %xmm0, %xmm1
+; SSE-NEXT: setae %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_comile_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vcomisd %xmm0, %xmm1
+; KNL-NEXT: setae %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comile.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -66,9 +135,19 @@ declare i32 @llvm.x86.sse2.comile.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comilt_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: comisd
- ; CHECK: sbbl %eax, %eax
- ; CHECK: andl $1, %eax
+; SSE-LABEL: test_x86_sse2_comilt_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: comisd %xmm0, %xmm1
+; SSE-NEXT: seta %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_comilt_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vcomisd %xmm0, %xmm1
+; KNL-NEXT: seta %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comilt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -76,25 +155,39 @@ declare i32 @llvm.x86.sse2.comilt.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comineq_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: comisd
- ; CHECK: setne
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_comineq_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: comisd %xmm1, %xmm0
+; SSE-NEXT: setp %al
+; SSE-NEXT: setne %cl
+; SSE-NEXT: orb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_comineq_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vcomisd %xmm1, %xmm0
+; KNL-NEXT: setp %al
+; KNL-NEXT: setne %cl
+; KNL-NEXT: orb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
declare i32 @llvm.x86.sse2.comineq.sd(<2 x double>, <2 x double>) nounwind readnone
-define <2 x double> @test_x86_sse2_cvtdq2pd(<4 x i32> %a0) {
- ; CHECK: cvtdq2pd
- %res = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0) ; <<2 x double>> [#uses=1]
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
-
-
define <4 x float> @test_x86_sse2_cvtdq2ps(<4 x i32> %a0) {
- ; CHECK: cvtdq2ps
+; SSE-LABEL: test_x86_sse2_cvtdq2ps:
+; SSE: ## BB#0:
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtdq2ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtdq2ps %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -102,7 +195,15 @@ declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse2_cvtpd2dq(<2 x double> %a0) {
- ; CHECK: cvtpd2dq
+; SSE-LABEL: test_x86_sse2_cvtpd2dq:
+; SSE: ## BB#0:
+; SSE-NEXT: cvtpd2dq %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtpd2dq:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtpd2dq %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -110,7 +211,15 @@ declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
define <4 x float> @test_x86_sse2_cvtpd2ps(<2 x double> %a0) {
- ; CHECK: cvtpd2ps
+; SSE-LABEL: test_x86_sse2_cvtpd2ps:
+; SSE: ## BB#0:
+; SSE-NEXT: cvtpd2ps %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtpd2ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtpd2ps %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -118,23 +227,31 @@ declare <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double>) nounwind readnone
define <4 x i32> @test_x86_sse2_cvtps2dq(<4 x float> %a0) {
- ; CHECK: cvtps2dq
+; SSE-LABEL: test_x86_sse2_cvtps2dq:
+; SSE: ## BB#0:
+; SSE-NEXT: cvtps2dq %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtps2dq:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtps2dq %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
-define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) {
- ; CHECK: cvtps2pd
- %res = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0) ; <<2 x double>> [#uses=1]
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
-
-
define i32 @test_x86_sse2_cvtsd2si(<2 x double> %a0) {
- ; CHECK: cvtsd2si
+; SSE-LABEL: test_x86_sse2_cvtsd2si:
+; SSE: ## BB#0:
+; SSE-NEXT: cvtsd2si %xmm0, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtsd2si:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtsd2si %xmm0, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -142,25 +259,47 @@ declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
define <4 x float> @test_x86_sse2_cvtsd2ss(<4 x float> %a0, <2 x double> %a1) {
- ; CHECK: cvtsd2ss
- ; SSE-NOT: cvtsd2ss %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+; SSE-LABEL: test_x86_sse2_cvtsd2ss:
+; SSE: ## BB#0:
+; SSE-NEXT: cvtsd2ss %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtsd2ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtsd2ss %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
-define <2 x double> @test_x86_sse2_cvtsi2sd(<2 x double> %a0) {
- ; CHECK: movl
- ; CHECK: cvtsi2sd
- %res = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> %a0, i32 7) ; <<2 x double>> [#uses=1]
+define <2 x double> @test_x86_sse2_cvtsi2sd(<2 x double> %a0, i32 %a1) {
+; SSE-LABEL: test_x86_sse2_cvtsi2sd:
+; SSE: ## BB#0:
+; SSE-NEXT: cvtsi2sdl {{[0-9]+}}(%esp), %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtsi2sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtsi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; KNL-NEXT: retl
+ %res = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> %a0, i32 %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
define <2 x double> @test_x86_sse2_cvtss2sd(<2 x double> %a0, <4 x float> %a1) {
- ; CHECK: cvtss2sd
+; SSE-LABEL: test_x86_sse2_cvtss2sd:
+; SSE: ## BB#0:
+; SSE-NEXT: cvtss2sd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvtss2sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> %a0, <4 x float> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -168,23 +307,31 @@ declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind
define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) {
- ; CHECK: cvttpd2dq
+; SSE-LABEL: test_x86_sse2_cvttpd2dq:
+; SSE: ## BB#0:
+; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvttpd2dq:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvttpd2dq %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
-define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
- ; CHECK: cvttps2dq
- %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
-
-
define i32 @test_x86_sse2_cvttsd2si(<2 x double> %a0) {
- ; CHECK: cvttsd2si
+; SSE-LABEL: test_x86_sse2_cvttsd2si:
+; SSE: ## BB#0:
+; SSE-NEXT: cvttsd2si %xmm0, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvttsd2si:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvttsd2si %xmm0, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -192,7 +339,15 @@ declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
define <2 x double> @test_x86_sse2_div_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: divsd
+; SSE-LABEL: test_x86_sse2_div_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: divsd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_div_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vdivsd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -201,7 +356,15 @@ declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_max_pd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: maxpd
+; SSE-LABEL: test_x86_sse2_max_pd:
+; SSE: ## BB#0:
+; SSE-NEXT: maxpd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_max_pd:
+; KNL: ## BB#0:
+; KNL-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -209,7 +372,15 @@ declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_max_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: maxsd
+; SSE-LABEL: test_x86_sse2_max_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: maxsd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_max_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -217,7 +388,15 @@ declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_min_pd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: minpd
+; SSE-LABEL: test_x86_sse2_min_pd:
+; SSE: ## BB#0:
+; SSE-NEXT: minpd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_min_pd:
+; KNL: ## BB#0:
+; KNL-NEXT: vminpd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -225,7 +404,15 @@ declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_min_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: minsd
+; SSE-LABEL: test_x86_sse2_min_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: minsd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_min_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vminsd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -233,7 +420,15 @@ declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind
define i32 @test_x86_sse2_movmsk_pd(<2 x double> %a0) {
- ; CHECK: movmskpd
+; SSE-LABEL: test_x86_sse2_movmsk_pd:
+; SSE: ## BB#0:
+; SSE-NEXT: movmskpd %xmm0, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_movmsk_pd:
+; KNL: ## BB#0:
+; KNL-NEXT: vmovmskpd %xmm0, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -243,8 +438,15 @@ declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
define <2 x double> @test_x86_sse2_mul_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: test_x86_sse2_mul_sd
- ; CHECK: mulsd
+; SSE-LABEL: test_x86_sse2_mul_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: mulsd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_mul_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vmulsd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -252,7 +454,15 @@ declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind
define <8 x i16> @test_x86_sse2_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: packssdw
+; SSE-LABEL: test_x86_sse2_packssdw_128:
+; SSE: ## BB#0:
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_packssdw_128:
+; KNL: ## BB#0:
+; KNL-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -260,7 +470,15 @@ declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind rea
define <16 x i8> @test_x86_sse2_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: packsswb
+; SSE-LABEL: test_x86_sse2_packsswb_128:
+; SSE: ## BB#0:
+; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_packsswb_128:
+; KNL: ## BB#0:
+; KNL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -268,7 +486,15 @@ declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind rea
define <16 x i8> @test_x86_sse2_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: packuswb
+; SSE-LABEL: test_x86_sse2_packuswb_128:
+; SSE: ## BB#0:
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_packuswb_128:
+; KNL: ## BB#0:
+; KNL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -276,7 +502,15 @@ declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind rea
define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: paddsb
+; SSE-LABEL: test_x86_sse2_padds_b:
+; SSE: ## BB#0:
+; SSE-NEXT: paddsb %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_padds_b:
+; KNL: ## BB#0:
+; KNL-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -284,7 +518,15 @@ declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_padds_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: paddsw
+; SSE-LABEL: test_x86_sse2_padds_w:
+; SSE: ## BB#0:
+; SSE-NEXT: paddsw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_padds_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpaddsw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -292,7 +534,15 @@ declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: paddusb
+; SSE-LABEL: test_x86_sse2_paddus_b:
+; SSE: ## BB#0:
+; SSE-NEXT: paddusb %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_paddus_b:
+; KNL: ## BB#0:
+; KNL-NEXT: vpaddusb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -300,7 +550,15 @@ declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnon
define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: paddusw
+; SSE-LABEL: test_x86_sse2_paddus_w:
+; SSE: ## BB#0:
+; SSE-NEXT: paddusw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_paddus_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpaddusw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -308,7 +566,15 @@ declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnon
define <16 x i8> @test_x86_sse2_pavg_b(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pavgb
+; SSE-LABEL: test_x86_sse2_pavg_b:
+; SSE: ## BB#0:
+; SSE-NEXT: pavgb %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pavg_b:
+; KNL: ## BB#0:
+; KNL-NEXT: vpavgb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -316,7 +582,15 @@ declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_pavg_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pavgw
+; SSE-LABEL: test_x86_sse2_pavg_w:
+; SSE: ## BB#0:
+; SSE-NEXT: pavgw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pavg_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpavgw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -324,7 +598,15 @@ declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pmaddwd
+; SSE-LABEL: test_x86_sse2_pmadd_wd:
+; SSE: ## BB#0:
+; SSE-NEXT: pmaddwd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pmadd_wd:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -332,7 +614,15 @@ declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnon
define <8 x i16> @test_x86_sse2_pmaxs_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pmaxsw
+; SSE-LABEL: test_x86_sse2_pmaxs_w:
+; SSE: ## BB#0:
+; SSE-NEXT: pmaxsw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pmaxs_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -340,7 +630,15 @@ declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_pmaxu_b(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pmaxub
+; SSE-LABEL: test_x86_sse2_pmaxu_b:
+; SSE: ## BB#0:
+; SSE-NEXT: pmaxub %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pmaxu_b:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -348,7 +646,15 @@ declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_pmins_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pminsw
+; SSE-LABEL: test_x86_sse2_pmins_w:
+; SSE: ## BB#0:
+; SSE-NEXT: pminsw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pmins_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -356,7 +662,15 @@ declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_pminu_b(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pminub
+; SSE-LABEL: test_x86_sse2_pminu_b:
+; SSE: ## BB#0:
+; SSE-NEXT: pminub %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pminu_b:
+; KNL: ## BB#0:
+; KNL-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -364,7 +678,15 @@ declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone
define i32 @test_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
- ; CHECK: pmovmskb
+; SSE-LABEL: test_x86_sse2_pmovmskb_128:
+; SSE: ## BB#0:
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pmovmskb_128:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmovmskb %xmm0, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -372,7 +694,15 @@ declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_pmulh_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pmulhw
+; SSE-LABEL: test_x86_sse2_pmulh_w:
+; SSE: ## BB#0:
+; SSE-NEXT: pmulhw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pmulh_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -380,7 +710,15 @@ declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @test_x86_sse2_pmulhu_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pmulhuw
+; SSE-LABEL: test_x86_sse2_pmulhu_w:
+; SSE: ## BB#0:
+; SSE-NEXT: pmulhuw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pmulhu_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -388,7 +726,15 @@ declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnon
define <2 x i64> @test_x86_sse2_pmulu_dq(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: pmuludq
+; SSE-LABEL: test_x86_sse2_pmulu_dq:
+; SSE: ## BB#0:
+; SSE-NEXT: pmuludq %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pmulu_dq:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -396,7 +742,15 @@ declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnon
define <2 x i64> @test_x86_sse2_psad_bw(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: psadbw
+; SSE-LABEL: test_x86_sse2_psad_bw:
+; SSE: ## BB#0:
+; SSE-NEXT: psadbw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psad_bw:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -404,7 +758,15 @@ declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: pslld
+; SSE-LABEL: test_x86_sse2_psll_d:
+; SSE: ## BB#0:
+; SSE-NEXT: pslld %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psll_d:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -412,7 +774,15 @@ declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_x86_sse2_psll_q(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: psllq
+; SSE-LABEL: test_x86_sse2_psll_q:
+; SSE: ## BB#0:
+; SSE-NEXT: psllq %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psll_q:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsllq %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -420,7 +790,15 @@ declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @test_x86_sse2_psll_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: psllw
+; SSE-LABEL: test_x86_sse2_psll_w:
+; SSE: ## BB#0:
+; SSE-NEXT: psllw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psll_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsllw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -428,7 +806,15 @@ declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_pslli_d(<4 x i32> %a0) {
- ; CHECK: pslld
+; SSE-LABEL: test_x86_sse2_pslli_d:
+; SSE: ## BB#0:
+; SSE-NEXT: pslld $7, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pslli_d:
+; KNL: ## BB#0:
+; KNL-NEXT: vpslld $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -436,7 +822,15 @@ declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_pslli_q(<2 x i64> %a0) {
- ; CHECK: psllq
+; SSE-LABEL: test_x86_sse2_pslli_q:
+; SSE: ## BB#0:
+; SSE-NEXT: psllq $7, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pslli_q:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsllq $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -444,7 +838,15 @@ declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
define <8 x i16> @test_x86_sse2_pslli_w(<8 x i16> %a0) {
- ; CHECK: psllw
+; SSE-LABEL: test_x86_sse2_pslli_w:
+; SSE: ## BB#0:
+; SSE-NEXT: psllw $7, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pslli_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsllw $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -452,7 +854,15 @@ declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone
define <4 x i32> @test_x86_sse2_psra_d(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: psrad
+; SSE-LABEL: test_x86_sse2_psra_d:
+; SSE: ## BB#0:
+; SSE-NEXT: psrad %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psra_d:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsrad %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -460,7 +870,15 @@ declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @test_x86_sse2_psra_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: psraw
+; SSE-LABEL: test_x86_sse2_psra_w:
+; SSE: ## BB#0:
+; SSE-NEXT: psraw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psra_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsraw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -468,7 +886,15 @@ declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_psrai_d(<4 x i32> %a0) {
- ; CHECK: psrad
+; SSE-LABEL: test_x86_sse2_psrai_d:
+; SSE: ## BB#0:
+; SSE-NEXT: psrad $7, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psrai_d:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsrad $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -476,7 +902,15 @@ declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) nounwind readnone
define <8 x i16> @test_x86_sse2_psrai_w(<8 x i16> %a0) {
- ; CHECK: psraw
+; SSE-LABEL: test_x86_sse2_psrai_w:
+; SSE: ## BB#0:
+; SSE-NEXT: psraw $7, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psrai_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsraw $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -484,7 +918,15 @@ declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: psrld
+; SSE-LABEL: test_x86_sse2_psrl_d:
+; SSE: ## BB#0:
+; SSE-NEXT: psrld %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psrl_d:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -492,7 +934,15 @@ declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_x86_sse2_psrl_q(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: psrlq
+; SSE-LABEL: test_x86_sse2_psrl_q:
+; SSE: ## BB#0:
+; SSE-NEXT: psrlq %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psrl_q:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -500,7 +950,15 @@ declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @test_x86_sse2_psrl_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: psrlw
+; SSE-LABEL: test_x86_sse2_psrl_w:
+; SSE: ## BB#0:
+; SSE-NEXT: psrlw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psrl_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -508,7 +966,15 @@ declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_psrli_d(<4 x i32> %a0) {
- ; CHECK: psrld
+; SSE-LABEL: test_x86_sse2_psrli_d:
+; SSE: ## BB#0:
+; SSE-NEXT: psrld $7, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psrli_d:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsrld $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -516,7 +982,15 @@ declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_psrli_q(<2 x i64> %a0) {
- ; CHECK: psrlq
+; SSE-LABEL: test_x86_sse2_psrli_q:
+; SSE: ## BB#0:
+; SSE-NEXT: psrlq $7, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psrli_q:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsrlq $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -524,7 +998,15 @@ declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) nounwind readnone
define <8 x i16> @test_x86_sse2_psrli_w(<8 x i16> %a0) {
- ; CHECK: psrlw
+; SSE-LABEL: test_x86_sse2_psrli_w:
+; SSE: ## BB#0:
+; SSE-NEXT: psrlw $7, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psrli_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsrlw $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -532,7 +1014,15 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone
define <16 x i8> @test_x86_sse2_psubs_b(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: psubsb
+; SSE-LABEL: test_x86_sse2_psubs_b:
+; SSE: ## BB#0:
+; SSE-NEXT: psubsb %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psubs_b:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -540,7 +1030,15 @@ declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_psubs_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: psubsw
+; SSE-LABEL: test_x86_sse2_psubs_w:
+; SSE: ## BB#0:
+; SSE-NEXT: psubsw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psubs_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsubsw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -548,7 +1046,15 @@ declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: psubusb
+; SSE-LABEL: test_x86_sse2_psubus_b:
+; SSE: ## BB#0:
+; SSE-NEXT: psubusb %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psubus_b:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -556,7 +1062,15 @@ declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnon
define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: psubusw
+; SSE-LABEL: test_x86_sse2_psubus_w:
+; SSE: ## BB#0:
+; SSE-NEXT: psubusw %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_psubus_w:
+; KNL: ## BB#0:
+; KNL-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -564,7 +1078,15 @@ declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnon
define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) {
- ; CHECK: sqrtpd
+; SSE-LABEL: test_x86_sse2_sqrt_pd:
+; SSE: ## BB#0:
+; SSE-NEXT: sqrtpd %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_sqrt_pd:
+; KNL: ## BB#0:
+; KNL-NEXT: vsqrtpd %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -572,50 +1094,31 @@ declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) {
- ; CHECK: sqrtsd
+; SSE-LABEL: test_x86_sse2_sqrt_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: sqrtsd %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_sqrt_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
-define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
- ; CHECK: test_x86_sse2_storel_dq
- ; CHECK: movl
- ; CHECK: movlps
- call void @llvm.x86.sse2.storel.dq(i8* %a0, <4 x i32> %a1)
- ret void
-}
-declare void @llvm.x86.sse2.storel.dq(i8*, <4 x i32>) nounwind
-
-
-define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
- ; CHECK: test_x86_sse2_storeu_dq
- ; CHECK: movl
- ; CHECK: movdqu
- ; add operation forces the execution domain.
- %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
- call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
- ret void
-}
-declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
-
-
-define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
- ; CHECK: test_x86_sse2_storeu_pd
- ; CHECK: movl
- ; CHECK: movupd
- ; fadd operation forces the execution domain.
- %a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
- call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
- ret void
-}
-declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
-
-
define <2 x double> @test_x86_sse2_sub_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: test_x86_sse2_sub_sd
- ; CHECK: subsd
+; SSE-LABEL: test_x86_sse2_sub_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: subsd %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_sub_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vsubsd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -623,9 +1126,23 @@ declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind
define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: ucomisd
- ; CHECK: sete
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_ucomieq_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: ucomisd %xmm1, %xmm0
+; SSE-NEXT: setnp %al
+; SSE-NEXT: sete %cl
+; SSE-NEXT: andb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_ucomieq_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vucomisd %xmm1, %xmm0
+; KNL-NEXT: setnp %al
+; KNL-NEXT: sete %cl
+; KNL-NEXT: andb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -633,9 +1150,19 @@ declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomige_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: ucomisd
- ; CHECK: setae
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_ucomige_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: ucomisd %xmm1, %xmm0
+; SSE-NEXT: setae %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_ucomige_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vucomisd %xmm1, %xmm0
+; KNL-NEXT: setae %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -643,9 +1170,19 @@ declare i32 @llvm.x86.sse2.ucomige.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomigt_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: ucomisd
- ; CHECK: seta
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_ucomigt_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: ucomisd %xmm1, %xmm0
+; SSE-NEXT: seta %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_ucomigt_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vucomisd %xmm1, %xmm0
+; KNL-NEXT: seta %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomigt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -653,9 +1190,19 @@ declare i32 @llvm.x86.sse2.ucomigt.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomile_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: ucomisd
- ; CHECK: setbe
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_ucomile_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: ucomisd %xmm0, %xmm1
+; SSE-NEXT: setae %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_ucomile_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vucomisd %xmm0, %xmm1
+; KNL-NEXT: setae %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -663,8 +1210,19 @@ declare i32 @llvm.x86.sse2.ucomile.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomilt_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: ucomisd
- ; CHECK: sbbl
+; SSE-LABEL: test_x86_sse2_ucomilt_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: ucomisd %xmm0, %xmm1
+; SSE-NEXT: seta %al
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_ucomilt_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vucomisd %xmm0, %xmm1
+; KNL-NEXT: seta %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomilt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -672,44 +1230,39 @@ declare i32 @llvm.x86.sse2.ucomilt.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: ucomisd
- ; CHECK: setne
- ; CHECK: movzbl
+; SSE-LABEL: test_x86_sse2_ucomineq_sd:
+; SSE: ## BB#0:
+; SSE-NEXT: ucomisd %xmm1, %xmm0
+; SSE-NEXT: setp %al
+; SSE-NEXT: setne %cl
+; SSE-NEXT: orb %al, %cl
+; SSE-NEXT: movzbl %cl, %eax
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_ucomineq_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vucomisd %xmm1, %xmm0
+; KNL-NEXT: setp %al
+; KNL-NEXT: setne %cl
+; KNL-NEXT: orb %al, %cl
+; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
declare i32 @llvm.x86.sse2.ucomineq.sd(<2 x double>, <2 x double>) nounwind readnone
define void @test_x86_sse2_pause() {
- ; CHECK: pause
+; SSE-LABEL: test_x86_sse2_pause:
+; SSE: ## BB#0:
+; SSE-NEXT: pause
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_pause:
+; KNL: ## BB#0:
+; KNL-NEXT: pause
+; KNL-NEXT: retl
tail call void @llvm.x86.sse2.pause()
- ret void
+ ret void
}
declare void @llvm.x86.sse2.pause() nounwind
-
-define <4 x i32> @test_x86_sse2_pshuf_d(<4 x i32> %a) {
-; CHECK-LABEL: test_x86_sse2_pshuf_d:
-; CHECK: pshufd $27
-entry:
- %res = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) nounwind readnone
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8) nounwind readnone
-
-define <8 x i16> @test_x86_sse2_pshufl_w(<8 x i16> %a) {
-; CHECK-LABEL: test_x86_sse2_pshufl_w:
-; CHECK: pshuflw $27
-entry:
- %res = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) nounwind readnone
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8) nounwind readnone
-
-define <8 x i16> @test_x86_sse2_pshufh_w(<8 x i16> %a) {
-; CHECK-LABEL: test_x86_sse2_pshufh_w:
-; CHECK: pshufhw $27
-entry:
- %res = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27) nounwind readnone
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll
index ed84905b1907..85e57e0dbdd1 100644
--- a/test/CodeGen/X86/sse2.ll
+++ b/test/CodeGen/X86/sse2.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Tests for SSE2 and below, without SSE3+.
-; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+sse2 -O3 | FileCheck %s
define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; CHECK-LABEL: test1:
@@ -8,7 +8,7 @@ define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movapd (%ecx), %xmm0
-; CHECK-NEXT: movlpd {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; CHECK-NEXT: movapd %xmm0, (%eax)
; CHECK-NEXT: retl
%tmp3 = load <2 x double>, <2 x double>* %A, align 16
@@ -24,7 +24,7 @@ define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movapd (%ecx), %xmm0
-; CHECK-NEXT: movhpd {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: movapd %xmm0, (%eax)
; CHECK-NEXT: retl
%tmp3 = load <2 x double>, <2 x double>* %A, align 16
diff --git a/test/CodeGen/X86/sse3-avx-addsub-2.ll b/test/CodeGen/X86/sse3-avx-addsub-2.ll
index 79317e4576b9..3f47d987aeda 100644
--- a/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -388,7 +388,7 @@ define <4 x float> @test15(<4 x float> %A, <4 x float> %B) {
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX-NEXT: vmovsldup {{.*#+}} xmm1 = xmm2[0,0,2,2]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: retq
%1 = extractelement <4 x float> %A, i32 1
diff --git a/test/CodeGen/X86/sse3-avx-addsub.ll b/test/CodeGen/X86/sse3-avx-addsub.ll
index c4da546ed77e..17586a811f40 100644
--- a/test/CodeGen/X86/sse3-avx-addsub.ll
+++ b/test/CodeGen/X86/sse3-avx-addsub.ll
@@ -121,8 +121,8 @@ define <16 x float> @test5(<16 x float> %A, <16 x float> %B) {
; AVX512: # BB#0:
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vsubps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,17,2,19,4,21,6,23,8,25,10,27,12,29,14,31]
-; AVX512-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
+; AVX512-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,2],zmm2[1,3],zmm0[4,6],zmm2[5,7],zmm0[8,10],zmm2[9,11],zmm0[12,14],zmm2[13,15]
+; AVX512-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15]
; AVX512-NEXT: retq
%add = fadd <16 x float> %A, %B
%sub = fsub <16 x float> %A, %B
@@ -149,8 +149,7 @@ define <8 x double> @test6(<8 x double> %A, <8 x double> %B) {
; AVX512: # BB#0:
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vsubpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,9,2,11,4,13,6,15]
-; AVX512-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0
+; AVX512-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm2[1],zmm0[2],zmm2[3],zmm0[4],zmm2[5],zmm0[6],zmm2[7]
; AVX512-NEXT: retq
%add = fadd <8 x double> %A, %B
%sub = fsub <8 x double> %A, %B
diff --git a/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll
index 217be9aeae3a..0111de2f5211 100644
--- a/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=X32
-; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=X64
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse3-builtins.c
@@ -94,7 +94,7 @@ define <4 x float> @test_mm_hsub_ps(<4 x float> %a0, <4 x float> %a1) {
}
declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
-define <2 x i64> @test_mm_lddqu_si128(i8* %a0) {
+define <2 x i64> @test_mm_lddqu_si128(<2 x i64>* %a0) {
; X32-LABEL: test_mm_lddqu_si128:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -105,7 +105,8 @@ define <2 x i64> @test_mm_lddqu_si128(i8* %a0) {
; X64: # BB#0:
; X64-NEXT: lddqu (%rdi), %xmm0
; X64-NEXT: retq
- %call = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %a0)
+ %bc = bitcast <2 x i64>* %a0 to i8*
+ %call = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %bc)
%res = bitcast <16 x i8> %call to <2 x i64>
ret <2 x i64> %res
}
@@ -115,12 +116,12 @@ define <2 x double> @test_mm_loaddup_pd(double* %a0) {
; X32-LABEL: test_mm_loaddup_pd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movddup (%eax), %xmm0
+; X32-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loaddup_pd:
; X64: # BB#0:
-; X64-NEXT: movddup (%rdi), %xmm0
+; X64-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
%ld = load double, double* %a0
%res0 = insertelement <2 x double> undef, double %ld, i32 0
diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll
index 2c24478706e6..6d51fb54f8b8 100644
--- a/test/CodeGen/X86/sse3.ll
+++ b/test/CodeGen/X86/sse3.ll
@@ -140,7 +140,7 @@ define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind {
; X64-LABEL: t9:
; X64: ## BB#0:
; X64-NEXT: movapd (%rdi), %xmm0
-; X64-NEXT: movhpd (%rsi), %xmm0
+; X64-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X64-NEXT: movapd %xmm0, (%rdi)
; X64-NEXT: retq
%tmp = load <4 x float>, <4 x float>* %r
@@ -207,7 +207,7 @@ define <8 x i16> @t12(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64: ## BB#0: ## %entry
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
; X64-NEXT: retq
entry:
%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef >
@@ -220,7 +220,7 @@ define <8 x i16> @t13(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X64: ## BB#0: ## %entry
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
; X64-NEXT: retq
entry:
%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef >
diff --git a/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..16868d854df7
--- /dev/null
+++ b/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
@@ -0,0 +1,1008 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse41-builtins.c
+
+define <2 x i64> @test_mm_blend_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_blend_epi16:
+; X32: # BB#0:
+; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_blend_epi16:
+; X64: # BB#0:
+; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %shuf = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 7>
+ %res = bitcast <8 x i16> %shuf to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x double> @test_mm_blend_pd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_blend_pd:
+; X32: # BB#0:
+; X32-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_blend_pd:
+; X64: # BB#0:
+; X64-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; X64-NEXT: retq
+ %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %res
+}
+
+define <4 x float> @test_mm_blend_ps(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_blend_ps:
+; X32: # BB#0:
+; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_blend_ps:
+; X64: # BB#0:
+; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; X64-NEXT: retq
+ %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+ ret <4 x float> %res
+}
+
+define <2 x i64> @test_mm_blendv_epi8(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm_blendv_epi8:
+; X32: # BB#0:
+; X32-NEXT: movdqa %xmm0, %xmm3
+; X32-NEXT: movaps %xmm2, %xmm0
+; X32-NEXT: pblendvb %xmm1, %xmm3
+; X32-NEXT: movdqa %xmm3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_blendv_epi8:
+; X64: # BB#0:
+; X64-NEXT: movdqa %xmm0, %xmm3
+; X64-NEXT: movaps %xmm2, %xmm0
+; X64-NEXT: pblendvb %xmm1, %xmm3
+; X64-NEXT: movdqa %xmm3, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %call = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %arg0, <16 x i8> %arg1, <16 x i8> %arg2)
+ %res = bitcast <16 x i8> %call to <2 x i64>
+ ret <2 x i64> %res
+}
+declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x double> @test_mm_blendv_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+; X32-LABEL: test_mm_blendv_pd:
+; X32: # BB#0:
+; X32-NEXT: movapd %xmm0, %xmm3
+; X32-NEXT: movaps %xmm2, %xmm0
+; X32-NEXT: blendvpd %xmm1, %xmm3
+; X32-NEXT: movapd %xmm3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_blendv_pd:
+; X64: # BB#0:
+; X64-NEXT: movapd %xmm0, %xmm3
+; X64-NEXT: movaps %xmm2, %xmm0
+; X64-NEXT: blendvpd %xmm1, %xmm3
+; X64-NEXT: movapd %xmm3, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <4 x float> @test_mm_blendv_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+; X32-LABEL: test_mm_blendv_ps:
+; X32: # BB#0:
+; X32-NEXT: movaps %xmm0, %xmm3
+; X32-NEXT: movaps %xmm2, %xmm0
+; X32-NEXT: blendvps %xmm1, %xmm3
+; X32-NEXT: movaps %xmm3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_blendv_ps:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, %xmm3
+; X64-NEXT: movaps %xmm2, %xmm0
+; X64-NEXT: blendvps %xmm1, %xmm3
+; X64-NEXT: movaps %xmm3, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @test_mm_ceil_pd(<2 x double> %a0) {
+; X32-LABEL: test_mm_ceil_pd:
+; X32: # BB#0:
+; X32-NEXT: roundpd $2, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ceil_pd:
+; X64: # BB#0:
+; X64-NEXT: roundpd $2, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+
+define <4 x float> @test_mm_ceil_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_ceil_ps:
+; X32: # BB#0:
+; X32-NEXT: roundps $2, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ceil_ps:
+; X64: # BB#0:
+; X64-NEXT: roundps $2, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
+
+define <2 x double> @test_mm_ceil_sd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_ceil_sd:
+; X32: # BB#0:
+; X32-NEXT: roundsd $2, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ceil_sd:
+; X64: # BB#0:
+; X64-NEXT: roundsd $2, %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define <4 x float> @test_mm_ceil_ss(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_ceil_ss:
+; X32: # BB#0:
+; X32-NEXT: roundss $2, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_ceil_ss:
+; X64: # BB#0:
+; X64-NEXT: roundss $2, %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
+
+define <2 x i64> @test_mm_cmpeq_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_cmpeq_epi64:
+; X32: # BB#0:
+; X32-NEXT: pcmpeqq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpeq_epi64:
+; X64: # BB#0:
+; X64-NEXT: pcmpeqq %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = icmp eq <2 x i64> %a0, %a1
+ %res = sext <2 x i1> %cmp to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_cvtepi8_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepi8_epi16:
+; X32: # BB#0:
+; X32-NEXT: pmovsxbw %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepi8_epi16:
+; X64: # BB#0:
+; X64-NEXT: pmovsxbw %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %sext = sext <8 x i8> %ext0 to <8 x i16>
+ %res = bitcast <8 x i16> %sext to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_cvtepi8_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepi8_epi32:
+; X32: # BB#0:
+; X32-NEXT: pmovsxbd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepi8_epi32:
+; X64: # BB#0:
+; X64-NEXT: pmovsxbd %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %sext = sext <4 x i8> %ext0 to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_cvtepi8_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepi8_epi64:
+; X32: # BB#0:
+; X32-NEXT: pmovsxbq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepi8_epi64:
+; X64: # BB#0:
+; X64-NEXT: pmovsxbq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+ %sext = sext <2 x i8> %ext0 to <2 x i64>
+ ret <2 x i64> %sext
+}
+
+define <2 x i64> @test_mm_cvtepi16_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepi16_epi32:
+; X32: # BB#0:
+; X32-NEXT: pmovsxwd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepi16_epi32:
+; X64: # BB#0:
+; X64-NEXT: pmovsxwd %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %sext = sext <4 x i16> %ext0 to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_cvtepi16_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepi16_epi64:
+; X32: # BB#0:
+; X32-NEXT: pmovsxwq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepi16_epi64:
+; X64: # BB#0:
+; X64-NEXT: pmovsxwq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+ %sext = sext <2 x i16> %ext0 to <2 x i64>
+ ret <2 x i64> %sext
+}
+
+define <2 x i64> @test_mm_cvtepi32_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepi32_epi64:
+; X32: # BB#0:
+; X32-NEXT: pmovsxdq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepi32_epi64:
+; X64: # BB#0:
+; X64-NEXT: pmovsxdq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %ext0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %sext = sext <2 x i32> %ext0 to <2 x i64>
+ ret <2 x i64> %sext
+}
+
+define <2 x i64> @test_mm_cvtepu8_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepu8_epi16:
+; X32: # BB#0:
+; X32-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepu8_epi16:
+; X64: # BB#0:
+; X64-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %sext = zext <8 x i8> %ext0 to <8 x i16>
+ %res = bitcast <8 x i16> %sext to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_cvtepu8_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepu8_epi32:
+; X32: # BB#0:
+; X32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepu8_epi32:
+; X64: # BB#0:
+; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %sext = zext <4 x i8> %ext0 to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_cvtepu8_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepu8_epi64:
+; X32: # BB#0:
+; X32-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepu8_epi64:
+; X64: # BB#0:
+; X64-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+ %sext = zext <2 x i8> %ext0 to <2 x i64>
+ ret <2 x i64> %sext
+}
+
+define <2 x i64> @test_mm_cvtepu16_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepu16_epi32:
+; X32: # BB#0:
+; X32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepu16_epi32:
+; X64: # BB#0:
+; X64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %sext = zext <4 x i16> %ext0 to <4 x i32>
+ %res = bitcast <4 x i32> %sext to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mm_cvtepu16_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepu16_epi64:
+; X32: # BB#0:
+; X32-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepu16_epi64:
+; X64: # BB#0:
+; X64-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+ %sext = zext <2 x i16> %ext0 to <2 x i64>
+ ret <2 x i64> %sext
+}
+
+define <2 x i64> @test_mm_cvtepu32_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_cvtepu32_epi64:
+; X32: # BB#0:
+; X32-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtepu32_epi64:
+; X64: # BB#0:
+; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %ext0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %sext = zext <2 x i32> %ext0 to <2 x i64>
+ ret <2 x i64> %sext
+}
+
+define <2 x double> @test_mm_dp_pd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_dp_pd:
+; X32: # BB#0:
+; X32-NEXT: dppd $7, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_dp_pd:
+; X64: # BB#0:
+; X64-NEXT: dppd $7, %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x float> @test_mm_dp_ps(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_dp_ps:
+; X32: # BB#0:
+; X32-NEXT: dpps $7, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_dp_ps:
+; X64: # BB#0:
+; X64-NEXT: dpps $7, %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define i32 @test_mm_extract_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_extract_epi8:
+; X32: # BB#0:
+; X32-NEXT: pextrb $1, %xmm0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_extract_epi8:
+; X64: # BB#0:
+; X64-NEXT: pextrb $1, %xmm0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %ext = extractelement <16 x i8> %arg0, i32 1
+ %res = zext i8 %ext to i32
+ ret i32 %res
+}
+
+define i32 @test_mm_extract_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_extract_epi32:
+; X32: # BB#0:
+; X32-NEXT: pextrd $1, %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_extract_epi32:
+; X64: # BB#0:
+; X64-NEXT: pextrd $1, %xmm0, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %ext = extractelement <4 x i32> %arg0, i32 1
+ ret i32 %ext
+}
+
+define i64 @test_mm_extract_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_extract_epi64:
+; X32: # BB#0:
+; X32-NEXT: pextrd $2, %xmm0, %eax
+; X32-NEXT: pextrd $3, %xmm0, %edx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_extract_epi64:
+; X64: # BB#0:
+; X64-NEXT: pextrq $1, %xmm0, %rax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %ext = extractelement <2 x i64> %a0, i32 1
+ ret i64 %ext
+}
+
+; TODO test_mm_extract_ps
+
+define <2 x double> @test_mm_floor_pd(<2 x double> %a0) {
+; X32-LABEL: test_mm_floor_pd:
+; X32: # BB#0:
+; X32-NEXT: roundpd $1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_floor_pd:
+; X64: # BB#0:
+; X64-NEXT: roundpd $1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 1)
+ ret <2 x double> %res
+}
+
+define <4 x float> @test_mm_floor_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_floor_ps:
+; X32: # BB#0:
+; X32-NEXT: roundps $1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_floor_ps:
+; X64: # BB#0:
+; X64-NEXT: roundps $1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 1)
+ ret <4 x float> %res
+}
+
+define <2 x double> @test_mm_floor_sd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_floor_sd:
+; X32: # BB#0:
+; X32-NEXT: roundsd $1, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_floor_sd:
+; X64: # BB#0:
+; X64-NEXT: roundsd $1, %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 1)
+ ret <2 x double> %res
+}
+
+define <4 x float> @test_mm_floor_ss(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_floor_ss:
+; X32: # BB#0:
+; X32-NEXT: roundss $1, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_floor_ss:
+; X64: # BB#0:
+; X64-NEXT: roundss $1, %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 1)
+ ret <4 x float> %res
+}
+
+define <2 x i64> @test_mm_insert_epi8(<2 x i64> %a0, i8 %a1) {
+; X32-LABEL: test_mm_insert_epi8:
+; X32: # BB#0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: pinsrb $1, %eax, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_insert_epi8:
+; X64: # BB#0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: pinsrb $1, %eax, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = insertelement <16 x i8> %arg0, i8 %a1,i32 1
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_insert_epi32(<2 x i64> %a0, i32 %a1) {
+; X32-LABEL: test_mm_insert_epi32:
+; X32: # BB#0:
+; X32-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_insert_epi32:
+; X64: # BB#0:
+; X64-NEXT: pinsrd $1, %edi, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = insertelement <4 x i32> %arg0, i32 %a1,i32 1
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_insert_epi64(<2 x i64> %a0, i64 %a1) {
+; X32-LABEL: test_mm_insert_epi64:
+; X32: # BB#0:
+; X32-NEXT: pinsrd $2, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_insert_epi64:
+; X64: # BB#0:
+; X64-NEXT: pinsrq $1, %rdi, %xmm0
+; X64-NEXT: retq
+ %res = insertelement <2 x i64> %a0, i64 %a1,i32 1
+ ret <2 x i64> %res
+}
+
+define <4 x float> @test_mm_insert_ps(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_insert_ps:
+; X32: # BB#0:
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1],zero,xmm0[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_insert_ps:
+; X64: # BB#0:
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1],zero,xmm0[3]
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 4)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_max_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_max_epi8:
+; X32: # BB#0:
+; X32-NEXT: pmaxsb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_epi8:
+; X64: # BB#0:
+; X64-NEXT: pmaxsb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %cmp = icmp sgt <16 x i8> %arg0, %arg1
+ %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1
+ %bc = bitcast <16 x i8> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_max_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_max_epi32:
+; X32: # BB#0:
+; X32-NEXT: pmaxsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_epi32:
+; X64: # BB#0:
+; X64-NEXT: pmaxsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %cmp = icmp sgt <4 x i32> %arg0, %arg1
+ %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
+ %bc = bitcast <4 x i32> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_max_epu16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_max_epu16:
+; X32: # BB#0:
+; X32-NEXT: pmaxuw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_epu16:
+; X64: # BB#0:
+; X64-NEXT: pmaxuw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %cmp = icmp ugt <8 x i16> %arg0, %arg1
+ %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1
+ %bc = bitcast <8 x i16> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_max_epu32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_max_epu32:
+; X32: # BB#0:
+; X32-NEXT: pmaxud %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_max_epu32:
+; X64: # BB#0:
+; X64-NEXT: pmaxud %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %cmp = icmp ugt <4 x i32> %arg0, %arg1
+ %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
+ %bc = bitcast <4 x i32> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_min_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_min_epi8:
+; X32: # BB#0:
+; X32-NEXT: pminsb %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_epi8:
+; X64: # BB#0:
+; X64-NEXT: pminsb %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %cmp = icmp slt <16 x i8> %arg0, %arg1
+ %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1
+ %bc = bitcast <16 x i8> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_min_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_min_epi32:
+; X32: # BB#0:
+; X32-NEXT: pminsd %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_epi32:
+; X64: # BB#0:
+; X64-NEXT: pminsd %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %cmp = icmp slt <4 x i32> %arg0, %arg1
+ %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
+ %bc = bitcast <4 x i32> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_min_epu16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_min_epu16:
+; X32: # BB#0:
+; X32-NEXT: pminuw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_epu16:
+; X64: # BB#0:
+; X64-NEXT: pminuw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %cmp = icmp ult <8 x i16> %arg0, %arg1
+ %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1
+ %bc = bitcast <8 x i16> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_min_epu32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_min_epu32:
+; X32: # BB#0:
+; X32-NEXT: pminud %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_min_epu32:
+; X64: # BB#0:
+; X64-NEXT: pminud %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %cmp = icmp ult <4 x i32> %arg0, %arg1
+ %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
+ %bc = bitcast <4 x i32> %sel to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_minpos_epu16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_minpos_epu16:
+; X32: # BB#0:
+; X32-NEXT: phminposuw %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_minpos_epu16:
+; X64: # BB#0:
+; X64-NEXT: phminposuw %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %arg0)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_mpsadbw_epu8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_mpsadbw_epu8:
+; X32: # BB#0:
+; X32-NEXT: mpsadbw $1, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mpsadbw_epu8:
+; X64: # BB#0:
+; X64-NEXT: mpsadbw $1, %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %arg0, <16 x i8> %arg1, i8 1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_mul_epi32:
+; X32: # BB#0:
+; X32-NEXT: pmuldq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mul_epi32:
+; X64: # BB#0:
+; X64-NEXT: pmuldq %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %arg0, <4 x i32> %arg1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_mullo_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_mullo_epi32:
+; X32: # BB#0:
+; X32-NEXT: pmulld %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mullo_epi32:
+; X64: # BB#0:
+; X64-NEXT: pmulld %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = mul <4 x i32> %arg0, %arg1
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+define <2 x i64> @test_mm_packus_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_packus_epi32:
+; X32: # BB#0:
+; X32-NEXT: packusdw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_packus_epi32:
+; X64: # BB#0:
+; X64-NEXT: packusdw %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x double> @test_mm_round_pd(<2 x double> %a0) {
+; X32-LABEL: test_mm_round_pd:
+; X32: # BB#0:
+; X32-NEXT: roundpd $4, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_round_pd:
+; X64: # BB#0:
+; X64-NEXT: roundpd $4, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 4)
+ ret <2 x double> %res
+}
+
+define <4 x float> @test_mm_round_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_round_ps:
+; X32: # BB#0:
+; X32-NEXT: roundps $4, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_round_ps:
+; X64: # BB#0:
+; X64-NEXT: roundps $4, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 4)
+ ret <4 x float> %res
+}
+
+define <2 x double> @test_mm_round_sd(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_round_sd:
+; X32: # BB#0:
+; X32-NEXT: roundsd $4, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_round_sd:
+; X64: # BB#0:
+; X64-NEXT: roundsd $4, %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 4)
+ ret <2 x double> %res
+}
+
+define <4 x float> @test_mm_round_ss(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: test_mm_round_ss:
+; X32: # BB#0:
+; X32-NEXT: roundss $4, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_round_ss:
+; X64: # BB#0:
+; X64-NEXT: roundss $4, %xmm1, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 4)
+ ret <4 x float> %res
+}
+
+define <2 x i64> @test_mm_stream_load_si128(<2 x i64>* %a0) {
+; X32-LABEL: test_mm_stream_load_si128:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movntdqa (%eax), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_stream_load_si128:
+; X64: # BB#0:
+; X64-NEXT: movntdqa (%rdi), %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64>* %a0 to i8*
+ %res = call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %arg0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.movntdqa(i8*) nounwind readnone
+
+define i32 @test_mm_test_all_ones(<2 x i64> %a0) {
+; X32-LABEL: test_mm_test_all_ones:
+; X32: # BB#0:
+; X32-NEXT: pcmpeqd %xmm1, %xmm1
+; X32-NEXT: ptest %xmm1, %xmm0
+; X32-NEXT: sbbl %eax, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_test_all_ones:
+; X64: # BB#0:
+; X64-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-NEXT: ptest %xmm1, %xmm0
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> <i64 -1, i64 -1>)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
+
+define i32 @test_mm_test_all_zeros(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_test_all_zeros:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ptest %xmm1, %xmm0
+; X32-NEXT: sete %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_test_all_zeros:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ptest %xmm1, %xmm0
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
+
+define i32 @test_mm_test_mix_ones_zeros(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_test_mix_ones_zeros:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ptest %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_test_mix_ones_zeros:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ptest %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
+
+define i32 @test_mm_testc_si128(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_testc_si128:
+; X32: # BB#0:
+; X32-NEXT: ptest %xmm1, %xmm0
+; X32-NEXT: sbbl %eax, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testc_si128:
+; X64: # BB#0:
+; X64-NEXT: ptest %xmm1, %xmm0
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
+ ret i32 %res
+}
+
+define i32 @test_mm_testnzc_si128(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_testnzc_si128:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ptest %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testnzc_si128:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ptest %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1)
+ ret i32 %res
+}
+
+define i32 @test_mm_testz_si128(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_testz_si128:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: ptest %xmm1, %xmm0
+; X32-NEXT: sete %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testz_si128:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: ptest %xmm1, %xmm0
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1)
+ ret i32 %res
+}
diff --git a/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
index 75f69ffd6db9..4f6aa798faf0 100644
--- a/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
@@ -1,17 +1,25 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse4.1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse4.1 | FileCheck %s
+
; This test works just like the non-upgrade one except that it only checks
; forms which require auto-upgrading.
define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: blendpd
- %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+; CHECK-LABEL: test_x86_sse41_blendpd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; CHECK-NEXT: retl
+ %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 6) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) nounwind readnone
define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: blendps
+; CHECK-LABEL: test_x86_sse41_blendps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; CHECK-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -19,7 +27,10 @@ declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) nounw
define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: dppd
+; CHECK-LABEL: test_x86_sse41_dppd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: dppd $7, %xmm1, %xmm0
+; CHECK-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -27,7 +38,10 @@ declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounw
define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: dpps
+; CHECK-LABEL: test_x86_sse41_dpps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: dpps $7, %xmm1, %xmm0
+; CHECK-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -35,15 +49,21 @@ declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind
define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: insertps
- %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+; CHECK-LABEL: test_x86_sse41_insertps:
+; CHECK: ## BB#0:
+; CHECK-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3]
+; CHECK-NEXT: retl
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 17) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: mpsadbw
+; CHECK-LABEL: test_x86_sse41_mpsadbw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: mpsadbw $7, %xmm1, %xmm0
+; CHECK-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -51,7 +71,10 @@ declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind re
define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pblendw
+; CHECK-LABEL: test_x86_sse41_pblendw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6,7]
+; CHECK-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -59,7 +82,10 @@ declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind re
define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) {
- ; CHECK: pmovsxbd
+; CHECK-LABEL: test_x86_sse41_pmovsxbd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovsxbd %xmm0, %xmm0
+; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -67,7 +93,10 @@ declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) {
- ; CHECK: pmovsxbq
+; CHECK-LABEL: test_x86_sse41_pmovsxbq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovsxbq %xmm0, %xmm0
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -75,7 +104,10 @@ declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) {
- ; CHECK: pmovsxbw
+; CHECK-LABEL: test_x86_sse41_pmovsxbw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovsxbw %xmm0, %xmm0
+; CHECK-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -83,7 +115,10 @@ declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) {
- ; CHECK: pmovsxdq
+; CHECK-LABEL: test_x86_sse41_pmovsxdq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovsxdq %xmm0, %xmm0
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -91,7 +126,10 @@ declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) {
- ; CHECK: pmovsxwd
+; CHECK-LABEL: test_x86_sse41_pmovsxwd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovsxwd %xmm0, %xmm0
+; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -99,8 +137,166 @@ declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) {
- ; CHECK: pmovsxwq
+; CHECK-LABEL: test_x86_sse41_pmovsxwq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovsxwq %xmm0, %xmm0
+; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxbd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: retl
+ %res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovzxbq(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxbq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retl
+ %res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_pmovzxbw(<16 x i8> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxbw:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: retl
+ %res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovzxdq(<4 x i32> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxdq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; CHECK-NEXT: retl
+ %res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmovzxwd(<8 x i16> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxwd:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: retl
+ %res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovzxwq(<8 x i16> %a0) {
+; CHECK-LABEL: test_x86_sse41_pmovzxwq:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; CHECK-NEXT: retl
+ %res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
+
+define <16 x i8> @max_epi8(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: max_epi8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmaxsb %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1)
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <16 x i8> @min_epi8(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: min_epi8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pminsb %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1)
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @max_epu16(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: max_epu16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmaxuw %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1)
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @min_epu16(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: min_epu16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pminuw %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @max_epi32(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: max_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmaxsd %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1)
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @min_epi32(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: min_epi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pminsd %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1)
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @max_epu32(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: max_epu32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pmaxud %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1)
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @min_epu32(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: min_epu32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pminud %xmm1, %xmm0
+; CHECK-NEXT: retl
+;
+ %res = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1)
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
+
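
The pmovzx tests added above belong in this upgrade file, which per its header comment only checks forms that require auto-upgrading, so those intrinsic calls are handled as legacy IR rather than by the backend. The equivalent generic IR, the form the new fast-isel tests earlier in this diff use directly, is a shuffle of the low elements followed by a zero-extension. A minimal sketch, assuming a hypothetical function name:

define <4 x i32> @pmovzxbd_equivalent(<16 x i8> %a0) {
  ; take the low four bytes of the source vector ...
  %lo = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; ... and zero-extend them to i32; with +sse4.1 this selects to pmovzxbd,
  ; matching the checks for test_mm_cvtepu8_epi32 earlier in this diff
  %res = zext <4 x i8> %lo to <4 x i32>
  ret <4 x i32> %res
}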
diff --git a/test/CodeGen/X86/sse41-intrinsics-x86.ll b/test/CodeGen/X86/sse41-intrinsics-x86.ll
index ceff4f9782e9..58eae1057f89 100644
--- a/test/CodeGen/X86/sse41-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse41-intrinsics-x86.ll
@@ -1,24 +1,20 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse4.1 | FileCheck %s
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s
-
-define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: blendpd
- %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
-
-
-define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: blendps
- %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE41
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK: blendvpd
+; SSE41-LABEL: test_x86_sse41_blendvpd:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movapd %xmm0, %xmm3
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm1, %xmm3
+; SSE41-NEXT: movapd %xmm3, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_blendvpd:
+; KNL: ## BB#0:
+; KNL-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -26,7 +22,18 @@ declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x d
define <4 x float> @test_x86_sse41_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: blendvps
+; SSE41-LABEL: test_x86_sse41_blendvps:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movaps %xmm0, %xmm3
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm1, %xmm3
+; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_blendvps:
+; KNL: ## BB#0:
+; KNL-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -34,7 +41,15 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: dppd
+; SSE41-LABEL: test_x86_sse41_dppd:
+; SSE41: ## BB#0:
+; SSE41-NEXT: dppd $7, %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_dppd:
+; KNL: ## BB#0:
+; KNL-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -42,7 +57,15 @@ declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwi
define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: dpps
+; SSE41-LABEL: test_x86_sse41_dpps:
+; SSE41: ## BB#0:
+; SSE41-NEXT: dpps $7, %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_dpps:
+; KNL: ## BB#0:
+; KNL-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -50,8 +73,16 @@ declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind
define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: insertps
- %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
+; SSE41-LABEL: test_x86_sse41_insertps:
+; SSE41: ## BB#0:
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3]
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_insertps:
+; KNL: ## BB#0:
+; KNL-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3]
+; KNL-NEXT: retl
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 17) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
@@ -59,7 +90,15 @@ declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounw
define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: mpsadbw
+; SSE41-LABEL: test_x86_sse41_mpsadbw:
+; SSE41: ## BB#0:
+; SSE41-NEXT: mpsadbw $7, %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_mpsadbw:
+; KNL: ## BB#0:
+; KNL-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -67,7 +106,15 @@ declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind rea
define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: packusdw
+; SSE41-LABEL: test_x86_sse41_packusdw:
+; SSE41: ## BB#0:
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_packusdw:
+; KNL: ## BB#0:
+; KNL-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -75,23 +122,34 @@ declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readno
define <16 x i8> @test_x86_sse41_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
- ; CHECK: pblendvb
+; SSE41-LABEL: test_x86_sse41_pblendvb:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: pblendvb %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pblendvb:
+; KNL: ## BB#0:
+; KNL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pblendw
- %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1]
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
-
-
define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) {
- ; CHECK: phminposuw
+; SSE41-LABEL: test_x86_sse41_phminposuw:
+; SSE41: ## BB#0:
+; SSE41-NEXT: phminposuw %xmm0, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_phminposuw:
+; KNL: ## BB#0:
+; KNL-NEXT: vphminposuw %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -99,7 +157,15 @@ declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse41_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pmaxsb
+; SSE41-LABEL: test_x86_sse41_pmaxsb:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pmaxsb %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pmaxsb:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -107,7 +173,15 @@ declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: pmaxsd
+; SSE41-LABEL: test_x86_sse41_pmaxsd:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pmaxsd %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pmaxsd:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -115,7 +189,15 @@ declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: pmaxud
+; SSE41-LABEL: test_x86_sse41_pmaxud:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pmaxud %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pmaxud:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -123,7 +205,15 @@ declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @test_x86_sse41_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pmaxuw
+; SSE41-LABEL: test_x86_sse41_pmaxuw:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pmaxuw %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pmaxuw:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -131,7 +221,15 @@ declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse41_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pminsb
+; SSE41-LABEL: test_x86_sse41_pminsb:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pminsb %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pminsb:
+; KNL: ## BB#0:
+; KNL-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -139,7 +237,15 @@ declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_x86_sse41_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: pminsd
+; SSE41-LABEL: test_x86_sse41_pminsd:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pminsd %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pminsd:
+; KNL: ## BB#0:
+; KNL-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -147,7 +253,15 @@ declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse41_pminud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: pminud
+; SSE41-LABEL: test_x86_sse41_pminud:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pminud %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pminud:
+; KNL: ## BB#0:
+; KNL-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -155,63 +269,31 @@ declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @test_x86_sse41_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: pminuw
+; SSE41-LABEL: test_x86_sse41_pminuw:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pminuw %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pminuw:
+; KNL: ## BB#0:
+; KNL-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
-define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) {
- ; CHECK: pmovzxbd
- %res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse41_pmovzxbq(<16 x i8> %a0) {
- ; CHECK: pmovzxbq
- %res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse41_pmovzxbw(<16 x i8> %a0) {
- ; CHECK: pmovzxbw
- %res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse41_pmovzxdq(<4 x i32> %a0) {
- ; CHECK: pmovzxdq
- %res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
-
-
-define <4 x i32> @test_x86_sse41_pmovzxwd(<8 x i16> %a0) {
- ; CHECK: pmovzxwd
- %res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse41_pmovzxwq(<8 x i16> %a0) {
- ; CHECK: pmovzxwq
- %res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
-
-
define <2 x i64> @test_x86_sse41_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: pmuldq
+; SSE41-LABEL: test_x86_sse41_pmuldq:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pmuldq %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_pmuldq:
+; KNL: ## BB#0:
+; KNL-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -219,8 +301,19 @@ declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
define i32 @test_x86_sse41_ptestc(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: ptest
- ; CHECK: sbbl
+; SSE41-LABEL: test_x86_sse41_ptestc:
+; SSE41: ## BB#0:
+; SSE41-NEXT: ptest %xmm1, %xmm0
+; SSE41-NEXT: sbbl %eax, %eax
+; SSE41-NEXT: andl $1, %eax
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_ptestc:
+; KNL: ## BB#0:
+; KNL-NEXT: vptest %xmm1, %xmm0
+; KNL-NEXT: sbbl %eax, %eax
+; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -228,9 +321,19 @@ declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
define i32 @test_x86_sse41_ptestnzc(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: ptest
- ; CHECK: seta
- ; CHECK: movzbl
+; SSE41-LABEL: test_x86_sse41_ptestnzc:
+; SSE41: ## BB#0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: ptest %xmm1, %xmm0
+; SSE41-NEXT: seta %al
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_ptestnzc:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vptest %xmm1, %xmm0
+; KNL-NEXT: seta %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -238,9 +341,19 @@ declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
define i32 @test_x86_sse41_ptestz(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: ptest
- ; CHECK: sete
- ; CHECK: movzbl
+; SSE41-LABEL: test_x86_sse41_ptestz:
+; SSE41: ## BB#0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: ptest %xmm1, %xmm0
+; SSE41-NEXT: sete %al
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_ptestz:
+; KNL: ## BB#0:
+; KNL-NEXT: xorl %eax, %eax
+; KNL-NEXT: vptest %xmm1, %xmm0
+; KNL-NEXT: sete %al
+; KNL-NEXT: retl
%res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -248,7 +361,15 @@ declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x double> @test_x86_sse41_round_pd(<2 x double> %a0) {
- ; CHECK: roundpd
+; SSE41-LABEL: test_x86_sse41_round_pd:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $7, %xmm0, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_round_pd:
+; KNL: ## BB#0:
+; KNL-NEXT: vroundpd $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -256,7 +377,15 @@ declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readno
define <4 x float> @test_x86_sse41_round_ps(<4 x float> %a0) {
- ; CHECK: roundps
+; SSE41-LABEL: test_x86_sse41_round_ps:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $7, %xmm0, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_round_ps:
+; KNL: ## BB#0:
+; KNL-NEXT: vroundps $7, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -264,7 +393,15 @@ declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
define <2 x double> @test_x86_sse41_round_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: roundsd
+; SSE41-LABEL: test_x86_sse41_round_sd:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundsd $7, %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_round_sd:
+; KNL: ## BB#0:
+; KNL-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -272,7 +409,15 @@ declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) n
define <4 x float> @test_x86_sse41_round_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: roundss
+; SSE41-LABEL: test_x86_sse41_round_ss:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundss $7, %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse41_round_ss:
+; KNL: ## BB#0:
+; KNL-NEXT: vroundss $7, %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
diff --git a/test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll b/test/CodeGen/X86/sse41-pmovxrm.ll
index a7e48d8ac038..756beb995c06 100644
--- a/test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll
+++ b/test/CodeGen/X86/sse41-pmovxrm.ll
@@ -109,8 +109,9 @@ define <8 x i16> @test_llvm_x86_sse41_pmovzxbw(<16 x i8>* %a) {
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
- %2 = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %1)
- ret <8 x i16> %2
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %3 = zext <8 x i8> %2 to <8 x i16>
+ ret <8 x i16> %3
}
define <4 x i32> @test_llvm_x86_sse41_pmovzxbd(<16 x i8>* %a) {
@@ -124,8 +125,9 @@ define <4 x i32> @test_llvm_x86_sse41_pmovzxbd(<16 x i8>* %a) {
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
- %2 = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %1)
- ret <4 x i32> %2
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = zext <4 x i8> %2 to <4 x i32>
+ ret <4 x i32> %3
}
define <2 x i64> @test_llvm_x86_sse41_pmovzxbq(<16 x i8>* %a) {
@@ -139,8 +141,9 @@ define <2 x i64> @test_llvm_x86_sse41_pmovzxbq(<16 x i8>* %a) {
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
- %2 = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %1)
- ret <2 x i64> %2
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+ %3 = zext <2 x i8> %2 to <2 x i64>
+ ret <2 x i64> %3
}
define <4 x i32> @test_llvm_x86_sse41_pmovzxwd(<8 x i16>* %a) {
@@ -154,8 +157,9 @@ define <4 x i32> @test_llvm_x86_sse41_pmovzxwd(<8 x i16>* %a) {
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a, align 1
- %2 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %1)
- ret <4 x i32> %2
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = zext <4 x i16> %2 to <4 x i32>
+ ret <4 x i32> %3
}
define <2 x i64> @test_llvm_x86_sse41_pmovzxwq(<8 x i16>* %a) {
@@ -169,8 +173,9 @@ define <2 x i64> @test_llvm_x86_sse41_pmovzxwq(<8 x i16>* %a) {
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; AVX-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a, align 1
- %2 = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %1)
- ret <2 x i64> %2
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+ %3 = zext <2 x i16> %2 to <2 x i64>
+ ret <2 x i64> %3
}
define <2 x i64> @test_llvm_x86_sse41_pmovzxdq(<4 x i32>* %a) {
@@ -184,13 +189,7 @@ define <2 x i64> @test_llvm_x86_sse41_pmovzxdq(<4 x i32>* %a) {
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; AVX-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %a, align 1
- %2 = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %1)
- ret <2 x i64> %2
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %3 = zext <2 x i32> %2 to <2 x i64>
+ ret <2 x i64> %3
}
-
-declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>)
-declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>)
-declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>)
-declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>)
-declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>)
-declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>)
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index 0a83a9753b81..3cb754c8f93f 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -141,14 +141,14 @@ define i32 @ext_3(<4 x i32> %v) nounwind {
define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: insertps_1:
; X32: ## BB#0:
-; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
+; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],zero,xmm0[3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_1:
; X64: ## BB#0:
-; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
+; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],zero,xmm0[3]
; X64-NEXT: retq
- %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone
+ %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 21) nounwind readnone
ret <4 x float> %tmp1
}
@@ -208,16 +208,16 @@ define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nou
define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_1:
; X32: ## BB#0:
+; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ptest %xmm1, %xmm0
; X32-NEXT: sete %al
-; X32-NEXT: movzbl %al, %eax
; X32-NEXT: retl
;
; X64-LABEL: ptestz_1:
; X64: ## BB#0:
+; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ptest %xmm1, %xmm0
; X64-NEXT: sete %al
-; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
%tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
ret i32 %tmp1
@@ -244,16 +244,16 @@ define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_3:
; X32: ## BB#0:
+; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ptest %xmm1, %xmm0
; X32-NEXT: seta %al
-; X32-NEXT: movzbl %al, %eax
; X32-NEXT: retl
;
; X64-LABEL: ptestz_3:
; X64: ## BB#0:
+; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ptest %xmm1, %xmm0
; X64-NEXT: seta %al
-; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
%tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
ret i32 %tmp1
@@ -507,16 +507,12 @@ define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X00A:
; X32: ## BB#0:
-; X32-NEXT: xorps %xmm2, %xmm2
-; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: shuf_X00A:
; X64: ## BB#0:
-; X64-NEXT: xorps %xmm2, %xmm2
-; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; X64-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
@@ -701,16 +697,16 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X00X:
; X32: ## BB#0:
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
-; X32-NEXT: pxor %xmm0, %xmm0
-; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; X32-NEXT: pxor %xmm1, %xmm1
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_X00X:
; X64: ## BB#0:
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
-; X64-NEXT: pxor %xmm0, %xmm0
-; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; X64-NEXT: retq
%vecext = extractelement <4 x i32> %x, i32 0
%vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
@@ -850,16 +846,12 @@ define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocap
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_broadcast_loadf32:
; X64: ## BB#0:
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT: retq
%1 = getelementptr inbounds float, float* %fb, i64 %index
%2 = load float, float* %1, align 4
@@ -875,16 +867,12 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float
; X32-LABEL: insertps_from_broadcast_loadv4f32:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movups (%eax), %xmm1
-; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_broadcast_loadv4f32:
; X64: ## BB#0:
-; X64-NEXT: movups (%rdi), %xmm1
-; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT: retq
%1 = load <4 x float>, <4 x float>* %b, align 4
%2 = extractelement <4 x float> %1, i32 0
@@ -896,14 +884,12 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float
ret <4 x float> %7
}
-;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
; X32-LABEL: insertps_from_broadcast_multiple_use:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; X32-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
@@ -916,7 +902,6 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
; X64-LABEL: insertps_from_broadcast_multiple_use:
; X64: ## BB#0:
; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; X64-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
diff --git a/test/CodeGen/X86/sse42-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/sse42-intrinsics-fast-isel-x86_64.ll
new file mode 100644
index 000000000000..0a69d2632123
--- /dev/null
+++ b/test/CodeGen/X86/sse42-intrinsics-fast-isel-x86_64.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse42-builtins.c
+
+define i64 @test_mm_crc64_u8(i64 %a0, i8 %a1) nounwind {
+; X64-LABEL: test_mm_crc64_u8:
+; X64: # BB#0:
+; X64-NEXT: crc32b %sil, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %res = call i64 @llvm.x86.sse42.crc32.64.8(i64 %a0, i8 %a1)
+ ret i64 %res
+}
+declare i64 @llvm.x86.sse42.crc32.64.8(i64, i8) nounwind readnone
+
+define i64 @test_mm_crc64_u64(i64 %a0, i64 %a1) nounwind {
+; X64-LABEL: test_mm_crc64_u64:
+; X64: # BB#0:
+; X64-NEXT: crc32q %rsi, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %res = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1)
+ ret i64 %res
+}
+declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind readnone
diff --git a/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..53b94e7f0d39
--- /dev/null
+++ b/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
@@ -0,0 +1,401 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse42-builtins.c
+
+define i32 @test_mm_cmpestra(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nounwind {
+; X32-LABEL: test_mm_cmpestra:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X32-NEXT: seta %bl
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: popl %ebx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpestra:
+; X64: # BB#0:
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X64-NEXT: seta %r8b
+; X64-NEXT: movl %r8d, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpestria128(<16 x i8> %arg0, i32 %a1, <16 x i8> %arg2, i32 %a3, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestria128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define i32 @test_mm_cmpestrc(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) {
+; X32-LABEL: test_mm_cmpestrc:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X32-NEXT: sbbl %eax, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpestrc:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %arg0, i32 %a1, <16 x i8> %arg2, i32 %a3, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define i32 @test_mm_cmpestri(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) {
+; X32-LABEL: test_mm_cmpestri:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpestri:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %arg0, i32 %a1, <16 x i8> %arg2, i32 %a3, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define <2 x i64> @test_mm_cmpestrm(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) {
+; X32-LABEL: test_mm_cmpestrm:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: pcmpestrm $7, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpestrm:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestrm $7, %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %arg0, i32 %a1, <16 x i8> %arg2, i32 %a3, i8 7)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define i32 @test_mm_cmpestro(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nounwind {
+; X32-LABEL: test_mm_cmpestro:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X32-NEXT: seto %bl
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: popl %ebx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpestro:
+; X64: # BB#0:
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X64-NEXT: seto %r8b
+; X64-NEXT: movl %r8d, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8> %arg0, i32 %a1, <16 x i8> %arg2, i32 %a3, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define i32 @test_mm_cmpestrs(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nounwind {
+; X32-LABEL: test_mm_cmpestrs:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X32-NEXT: sets %bl
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: popl %ebx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpestrs:
+; X64: # BB#0:
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X64-NEXT: sets %r8b
+; X64-NEXT: movl %r8d, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpestris128(<16 x i8> %arg0, i32 %a1, <16 x i8> %arg2, i32 %a3, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestris128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define i32 @test_mm_cmpestrz(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nounwind {
+; X32-LABEL: test_mm_cmpestrz:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X32-NEXT: sete %bl
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: popl %ebx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpestrz:
+; X64: # BB#0:
+; X64-NEXT: xorl %r8d, %r8d
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edx
+; X64-NEXT: pcmpestri $7, %xmm1, %xmm0
+; X64-NEXT: sete %r8b
+; X64-NEXT: movl %r8d, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8> %arg0, i32 %a1, <16 x i8> %arg2, i32 %a3, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+define <2 x i64> @test_mm_cmpgt_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_cmpgt_epi64:
+; X32: # BB#0:
+; X32-NEXT: pcmpgtq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpgt_epi64:
+; X64: # BB#0:
+; X64-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-NEXT: retq
+ %cmp = icmp sgt <2 x i64> %a0, %a1
+ %res = sext <2 x i1> %cmp to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define i32 @test_mm_cmpistra(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_cmpistra:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X32-NEXT: seta %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpistra:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X64-NEXT: seta %al
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpistria128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistria128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define i32 @test_mm_cmpistrc(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_cmpistrc:
+; X32: # BB#0:
+; X32-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X32-NEXT: sbbl %eax, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpistrc:
+; X64: # BB#0:
+; X64-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define i32 @test_mm_cmpistri(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_cmpistri:
+; X32: # BB#0:
+; X32-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpistri:
+; X64: # BB#0:
+; X64-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_cmpistrm(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_cmpistrm:
+; X32: # BB#0:
+; X32-NEXT: pcmpistrm $7, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpistrm:
+; X64: # BB#0:
+; X64-NEXT: pcmpistrm $7, %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define i32 @test_mm_cmpistro(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_cmpistro:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X32-NEXT: seto %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpistro:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X64-NEXT: seto %al
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define i32 @test_mm_cmpistrs(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_cmpistrs:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X32-NEXT: sets %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpistrs:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X64-NEXT: sets %al
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpistris128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistris128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define i32 @test_mm_cmpistrz(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_cmpistrz:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X32-NEXT: sete %al
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmpistrz:
+; X64: # BB#0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: pcmpistri $7, %xmm1, %xmm0
+; X64-NEXT: sete %al
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8> %arg0, <16 x i8> %arg1, i8 7)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define i32 @test_mm_crc32_u8(i32 %a0, i8 %a1) {
+; X32-LABEL: test_mm_crc32_u8:
+; X32: # BB#0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: crc32b %cl, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_crc32_u8:
+; X64: # BB#0:
+; X64-NEXT: crc32b %sil, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a0, i8 %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind readnone
+
+define i32 @test_mm_crc32_u16(i32 %a0, i16 %a1) {
+; X32-LABEL: test_mm_crc32_u16:
+; X32: # BB#0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: crc32w %cx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_crc32_u16:
+; X64: # BB#0:
+; X64-NEXT: crc32w %si, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a0, i16 %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind readnone
+
+define i32 @test_mm_crc32_u32(i32 %a0, i32 %a1) {
+; X32-LABEL: test_mm_crc32_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: crc32l {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_crc32_u32:
+; X64: # BB#0:
+; X64-NEXT: crc32l %esi, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %res = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1)
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind readnone
diff --git a/test/CodeGen/X86/sse42-intrinsics-x86.ll b/test/CodeGen/X86/sse42-intrinsics-x86.ll
index 706c86b71a4a..2b31109ce45c 100644
--- a/test/CodeGen/X86/sse42-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse42-intrinsics-x86.ll
@@ -1,10 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse4.2 | FileCheck %s
define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
- ; CHECK: movl $7
- ; CHECK: movl $7
- ; CHECK: pcmpestri $7
- ; CHECK: movl
+; CHECK-LABEL: test_x86_sse42_pcmpestri128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: pcmpestri $7, %xmm1, %xmm0
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -12,10 +16,16 @@ declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nou
define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
- ; CHECK: movl $7
- ; CHECK: movl $7
- ; CHECK: pcmpestri $7, (
- ; CHECK: movl
+; CHECK-LABEL: test_x86_sse42_pcmpestri128_load:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movdqa (%eax), %xmm0
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: pcmpestri $7, (%ecx), %xmm0
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retl
%1 = load <16 x i8>, <16 x i8>* %a0
%2 = load <16 x i8>, <16 x i8>* %a2
%res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7) ; <i32> [#uses=1]
@@ -23,11 +33,18 @@ define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
}
-define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) {
- ; CHECK: movl
- ; CHECK: movl
- ; CHECK: pcmpestri
- ; CHECK: seta
+define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+; CHECK-LABEL: test_x86_sse42_pcmpestria128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: pcmpestri $7, %xmm1, %xmm0
+; CHECK-NEXT: seta %bl
+; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestria128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -35,43 +52,68 @@ declare i32 @llvm.x86.sse42.pcmpestria128(<16 x i8>, i32, <16 x i8>, i32, i8) no
define i32 @test_x86_sse42_pcmpestric128(<16 x i8> %a0, <16 x i8> %a2) {
- ; CHECK: movl
- ; CHECK: movl
- ; CHECK: pcmpestri
- ; CHECK: sbbl
+; CHECK-LABEL: test_x86_sse42_pcmpestric128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: pcmpestri $7, %xmm1, %xmm0
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) {
- ; CHECK: movl
- ; CHECK: movl
- ; CHECK: pcmpestri
- ; CHECK: seto
+define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+; CHECK-LABEL: test_x86_sse42_pcmpestrio128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: pcmpestri $7, %xmm1, %xmm0
+; CHECK-NEXT: seto %bl
+; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
declare i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) {
- ; CHECK: movl
- ; CHECK: movl
- ; CHECK: pcmpestri
- ; CHECK: sets
+define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+; CHECK-LABEL: test_x86_sse42_pcmpestris128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: pcmpestri $7, %xmm1, %xmm0
+; CHECK-NEXT: sets %bl
+; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestris128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
declare i32 @llvm.x86.sse42.pcmpestris128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) {
- ; CHECK: movl
- ; CHECK: movl
- ; CHECK: pcmpestri
- ; CHECK: sete
+define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
+; CHECK-LABEL: test_x86_sse42_pcmpestriz128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: pcmpestri $7, %xmm1, %xmm0
+; CHECK-NEXT: sete %bl
+; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -79,10 +121,12 @@ declare i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8>, i32, <16 x i8>, i32, i8) no
define <16 x i8> @test_x86_sse42_pcmpestrm128(<16 x i8> %a0, <16 x i8> %a2) {
- ; CHECK: movl
- ; CHECK: movl
- ; CHECK: pcmpestrm
- ; CHECK-NOT: vmov
+; CHECK-LABEL: test_x86_sse42_pcmpestrm128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: pcmpestrm $7, %xmm1, %xmm0
+; CHECK-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -90,10 +134,13 @@ declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i
define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2) {
- ; CHECK: movl $7
- ; CHECK: movl $7
- ; CHECK: pcmpestrm $7,
- ; CHECK-NOT: vmov
+; CHECK-LABEL: test_x86_sse42_pcmpestrm128_load:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: pcmpestrm $7, (%ecx), %xmm0
+; CHECK-NEXT: retl
%1 = load <16 x i8>, <16 x i8>* %a2
%res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %1, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
@@ -101,8 +148,11 @@ define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2
define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pcmpistri $7
- ; CHECK: movl
+; CHECK-LABEL: test_x86_sse42_pcmpistri128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pcmpistri $7, %xmm1, %xmm0
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -110,8 +160,14 @@ declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind read
define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
- ; CHECK: pcmpistri $7, (
- ; CHECK: movl
+; CHECK-LABEL: test_x86_sse42_pcmpistri128_load:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: movdqa (%ecx), %xmm0
+; CHECK-NEXT: pcmpistri $7, (%eax), %xmm0
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retl
%1 = load <16 x i8>, <16 x i8>* %a0
%2 = load <16 x i8>, <16 x i8>* %a1
%res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %1, <16 x i8> %2, i8 7) ; <i32> [#uses=1]
@@ -120,8 +176,12 @@ define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
define i32 @test_x86_sse42_pcmpistria128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pcmpistri
- ; CHECK: seta
+; CHECK-LABEL: test_x86_sse42_pcmpistria128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: pcmpistri $7, %xmm1, %xmm0
+; CHECK-NEXT: seta %al
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistria128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -129,8 +189,12 @@ declare i32 @llvm.x86.sse42.pcmpistria128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistric128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pcmpistri
- ; CHECK: sbbl
+; CHECK-LABEL: test_x86_sse42_pcmpistric128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pcmpistri $7, %xmm1, %xmm0
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -138,8 +202,12 @@ declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pcmpistri
- ; CHECK: seto
+; CHECK-LABEL: test_x86_sse42_pcmpistrio128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: pcmpistri $7, %xmm1, %xmm0
+; CHECK-NEXT: seto %al
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -147,8 +215,12 @@ declare i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistris128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pcmpistri
- ; CHECK: sets
+; CHECK-LABEL: test_x86_sse42_pcmpistris128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: pcmpistri $7, %xmm1, %xmm0
+; CHECK-NEXT: sets %al
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistris128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -156,8 +228,12 @@ declare i32 @llvm.x86.sse42.pcmpistris128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pcmpistri
- ; CHECK: sete
+; CHECK-LABEL: test_x86_sse42_pcmpistriz128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: pcmpistri $7, %xmm1, %xmm0
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retl
%res = call i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -165,8 +241,10 @@ declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind rea
define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: pcmpistrm $7
- ; CHECK-NOT: vmov
+; CHECK-LABEL: test_x86_sse42_pcmpistrm128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pcmpistrm $7, %xmm1, %xmm0
+; CHECK-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -174,8 +252,11 @@ declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwin
define <16 x i8> @test_x86_sse42_pcmpistrm128_load(<16 x i8> %a0, <16 x i8>* %a1) {
- ; CHECK: pcmpistrm $7, (
- ; CHECK-NOT: vmov
+; CHECK-LABEL: test_x86_sse42_pcmpistrm128_load:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: pcmpistrm $7, (%eax), %xmm0
+; CHECK-NEXT: retl
%1 = load <16 x i8>, <16 x i8>* %a1
%res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %1, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
diff --git a/test/CodeGen/X86/sse42.ll b/test/CodeGen/X86/sse42.ll
index db51d9973688..2d05f9884c42 100644
--- a/test/CodeGen/X86/sse42.ll
+++ b/test/CodeGen/X86/sse42.ll
@@ -1,39 +1,58 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.2 | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.2 | FileCheck %s -check-prefix=X64
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.2 | FileCheck %s --check-prefix=X64
declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
define i32 @crc32_32_8(i32 %a, i8 %b) nounwind {
+; X32-LABEL: crc32_32_8:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: crc32b {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: crc32_32_8:
+; X64: ## BB#0:
+; X64-NEXT: crc32b %sil, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%tmp = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a, i8 %b)
ret i32 %tmp
-; X32: _crc32_32_8:
-; X32: crc32b 8(%esp), %eax
-
-; X64: _crc32_32_8:
-; X64: crc32b %sil,
}
define i32 @crc32_32_16(i32 %a, i16 %b) nounwind {
+; X32-LABEL: crc32_32_16:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: crc32w {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: crc32_32_16:
+; X64: ## BB#0:
+; X64-NEXT: crc32w %si, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%tmp = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a, i16 %b)
ret i32 %tmp
-; X32: _crc32_32_16:
-; X32: crc32w 8(%esp), %eax
-
-; X64: _crc32_32_16:
-; X64: crc32w %si,
}
define i32 @crc32_32_32(i32 %a, i32 %b) nounwind {
+; X32-LABEL: crc32_32_32:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: crc32l {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: crc32_32_32:
+; X64: ## BB#0:
+; X64-NEXT: crc32l %esi, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%tmp = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a, i32 %b)
ret i32 %tmp
-; X32: _crc32_32_32:
-; X32: crc32l 8(%esp), %eax
-
-; X64: _crc32_32_32:
-; X64: crc32l %esi,
}
diff --git a/test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll
index f93a16a5eb3d..f45abf1d85df 100644
--- a/test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=X32
-; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse4a,+avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse4a,+avx | FileCheck %s --check-prefix=X64
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse4a-builtins.c
@@ -65,7 +67,7 @@ define <2 x i64> @test_mm_insert_si64(<2 x i64> %x, <2 x i64> %y) {
}
declare <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64>, <2 x i64>) nounwind readnone
-define void @test_stream_sd(i8* %p, <2 x double> %a) {
+define void @test_stream_sd(double* %p, <2 x double> %a) {
; X32-LABEL: test_stream_sd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -76,12 +78,12 @@ define void @test_stream_sd(i8* %p, <2 x double> %a) {
; X64: # BB#0:
; X64-NEXT: movntsd %xmm0, (%rdi)
; X64-NEXT: retq
- call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a)
+ %1 = extractelement <2 x double> %a, i64 0
+ store double %1, double* %p, align 1, !nontemporal !1
ret void
}
-declare void @llvm.x86.sse4a.movnt.sd(i8*, <2 x double>) nounwind readnone
-define void @test_mm_stream_ss(i8* %p, <4 x float> %a) {
+define void @test_mm_stream_ss(float* %p, <4 x float> %a) {
; X32-LABEL: test_mm_stream_ss:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -92,7 +94,9 @@ define void @test_mm_stream_ss(i8* %p, <4 x float> %a) {
; X64: # BB#0:
; X64-NEXT: movntss %xmm0, (%rdi)
; X64-NEXT: retq
- call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a)
+ %1 = extractelement <4 x float> %a, i64 0
+ store float %1, float* %p, align 1, !nontemporal !1
ret void
}
-declare void @llvm.x86.sse4a.movnt.ss(i8*, <4 x float>) nounwind readnone
+
+!1 = !{i32 1}
diff --git a/test/CodeGen/X86/sse4a-upgrade.ll b/test/CodeGen/X86/sse4a-upgrade.ll
new file mode 100644
index 000000000000..a129c658f4b9
--- /dev/null
+++ b/test/CodeGen/X86/sse4a-upgrade.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4a,+avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a,+avx | FileCheck %s --check-prefix=X64
+
+define void @test_movntss(i8* %p, <4 x float> %a) nounwind optsize ssp {
+; X32-LABEL: test_movntss:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movntss %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_movntss:
+; X64: # BB#0:
+; X64-NEXT: movntss %xmm0, (%rdi)
+; X64-NEXT: retq
+ tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a) nounwind
+ ret void
+}
+
+declare void @llvm.x86.sse4a.movnt.ss(i8*, <4 x float>)
+
+define void @test_movntsd(i8* %p, <2 x double> %a) nounwind optsize ssp {
+; X32-LABEL: test_movntsd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movntsd %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_movntsd:
+; X64: # BB#0:
+; X64-NEXT: movntsd %xmm0, (%rdi)
+; X64-NEXT: retq
+ tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a) nounwind
+ ret void
+}
+
+declare void @llvm.x86.sse4a.movnt.sd(i8*, <2 x double>)
diff --git a/test/CodeGen/X86/sse4a.ll b/test/CodeGen/X86/sse4a.ll
index f8fa125f98e7..1f582fb4ed9d 100644
--- a/test/CodeGen/X86/sse4a.ll
+++ b/test/CodeGen/X86/sse4a.ll
@@ -1,36 +1,35 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4a | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-linux -mattr=sse4a | FileCheck %s
-
-define void @test1(i8* %p, <4 x float> %a) nounwind optsize ssp {
-; CHECK-LABEL: test1:
-; CHECK: movntss
- tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a) nounwind
- ret void
-}
-
-declare void @llvm.x86.sse4a.movnt.ss(i8*, <4 x float>)
-
-define void @test2(i8* %p, <2 x double> %a) nounwind optsize ssp {
-; CHECK-LABEL: test2:
-; CHECK: movntsd
- tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a) nounwind
- ret void
-}
-
-declare void @llvm.x86.sse4a.movnt.sd(i8*, <2 x double>)
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4a,+avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a,+avx | FileCheck %s --check-prefix=X64
-define <2 x i64> @test3(<2 x i64> %x) nounwind uwtable ssp {
-; CHECK-LABEL: test3:
-; CHECK: extrq
+define <2 x i64> @test_extrqi(<2 x i64> %x) nounwind uwtable ssp {
+; X32-LABEL: test_extrqi:
+; X32: # BB#0:
+; X32-NEXT: extrq $2, $3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_extrqi:
+; X64: # BB#0:
+; X64-NEXT: extrq $2, $3, %xmm0
+; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2)
ret <2 x i64> %1
}
declare <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64>, i8, i8) nounwind
-define <2 x i64> @test4(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
-; CHECK-LABEL: test4:
-; CHECK: extrq
+define <2 x i64> @test_extrq(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; X32-LABEL: test_extrq:
+; X32: # BB#0:
+; X32-NEXT: extrq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_extrq:
+; X64: # BB#0:
+; X64-NEXT: extrq %xmm1, %xmm0
+; X64-NEXT: retq
%1 = bitcast <2 x i64> %y to <16 x i8>
%2 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %1) nounwind
ret <2 x i64> %2
@@ -38,18 +37,32 @@ define <2 x i64> @test4(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
declare <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64>, <16 x i8>) nounwind
-define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
-; CHECK-LABEL: test5:
-; CHECK: insertq
+define <2 x i64> @test_insertqi(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; X32-LABEL: test_insertqi:
+; X32: # BB#0:
+; X32-NEXT: insertq $6, $5, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_insertqi:
+; X64: # BB#0:
+; X64-NEXT: insertq $6, $5, %xmm1, %xmm0
+; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 5, i8 6)
ret <2 x i64> %1
}
declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind
-define <2 x i64> @test6(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
-; CHECK-LABEL: test6:
-; CHECK: insertq
+define <2 x i64> @test_insertq(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; X32-LABEL: test_insertq:
+; X32: # BB#0:
+; X32-NEXT: insertq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_insertq:
+; X64: # BB#0:
+; X64-NEXT: insertq %xmm1, %xmm0
+; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y) nounwind
ret <2 x i64> %1
}
diff --git a/test/CodeGen/X86/sse_partial_update.ll b/test/CodeGen/X86/sse_partial_update.ll
index 8d61428420f6..51359d1790af 100644
--- a/test/CodeGen/X86/sse_partial_update.ll
+++ b/test/CodeGen/X86/sse_partial_update.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -mcpu=nehalem | FileCheck %s
; rdar: 12558838
@@ -8,14 +9,15 @@
; destination of each scalar unary op are the same.
define void @rsqrtss(<4 x float> %a) nounwind uwtable ssp {
-entry:
; CHECK-LABEL: rsqrtss:
-; CHECK: rsqrtss %xmm0, %xmm0
-; CHECK-NEXT: cvtss2sd %xmm0
-; CHECK-NEXT: movshdup
-; CHECK-NEXT: cvtss2sd %xmm0
-; CHECK-NEXT: movap
-; CHECK-NEXT: jmp
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: rsqrtss %xmm0, %xmm0
+; CHECK-NEXT: cvtss2sd %xmm0, %xmm2
+; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT: cvtss2sd %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm2, %xmm0
+; CHECK-NEXT: jmp _callee ## TAILCALL
+entry:
%0 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a) nounwind
%a.addr.0.extract = extractelement <4 x float> %0, i32 0
@@ -29,14 +31,15 @@ declare void @callee(double, double)
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
define void @rcpss(<4 x float> %a) nounwind uwtable ssp {
-entry:
; CHECK-LABEL: rcpss:
-; CHECK: rcpss %xmm0, %xmm0
-; CHECK-NEXT: cvtss2sd %xmm0
-; CHECK-NEXT: movshdup
-; CHECK-NEXT: cvtss2sd %xmm0
-; CHECK-NEXT: movap
-; CHECK-NEXT: jmp
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: rcpss %xmm0, %xmm0
+; CHECK-NEXT: cvtss2sd %xmm0, %xmm2
+; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT: cvtss2sd %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm2, %xmm0
+; CHECK-NEXT: jmp _callee ## TAILCALL
+entry:
%0 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a) nounwind
%a.addr.0.extract = extractelement <4 x float> %0, i32 0
@@ -49,14 +52,15 @@ entry:
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define void @sqrtss(<4 x float> %a) nounwind uwtable ssp {
-entry:
; CHECK-LABEL: sqrtss:
-; CHECK: sqrtss %xmm0, %xmm0
-; CHECK-NEXT: cvtss2sd %xmm0
-; CHECK-NEXT: movshdup
-; CHECK-NEXT: cvtss2sd %xmm0
-; CHECK-NEXT: movap
-; CHECK-NEXT: jmp
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: sqrtss %xmm0, %xmm0
+; CHECK-NEXT: cvtss2sd %xmm0, %xmm2
+; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT: cvtss2sd %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm2, %xmm0
+; CHECK-NEXT: jmp _callee ## TAILCALL
+entry:
%0 = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a) nounwind
%a.addr.0.extract = extractelement <4 x float> %0, i32 0
@@ -69,14 +73,15 @@ entry:
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define void @sqrtsd(<2 x double> %a) nounwind uwtable ssp {
-entry:
; CHECK-LABEL: sqrtsd:
-; CHECK: sqrtsd %xmm0, %xmm0
-; CHECK-NEXT: cvtsd2ss %xmm0
-; CHECK-NEXT: shufpd
-; CHECK-NEXT: cvtsd2ss %xmm0
-; CHECK-NEXT: movap
-; CHECK-NEXT: jmp
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: sqrtsd %xmm0, %xmm0
+; CHECK-NEXT: cvtsd2ss %xmm0, %xmm2
+; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: cvtsd2ss %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm2, %xmm0
+; CHECK-NEXT: jmp _callee2 ## TAILCALL
+entry:
%0 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a) nounwind
%a0 = extractelement <2 x double> %0, i32 0
@@ -92,10 +97,11 @@ declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define <2 x double> @load_fold_cvtss2sd_int(<4 x float> *%a) {
; CHECK-LABEL: load_fold_cvtss2sd_int:
-; CHECK: movaps (%rdi), %xmm1
-; CHECK-NEXT: xorps %xmm0, %xmm0
-; CHECK-NEXT: cvtss2sd %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK: ## BB#0:
+; CHECK-NEXT: movaps (%rdi), %xmm1
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: cvtss2sd %xmm1, %xmm0
+; CHECK-NEXT: retq
%ld = load <4 x float>, <4 x float> *%a
%x = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %ld)
ret <2 x double> %x
@@ -103,9 +109,10 @@ define <2 x double> @load_fold_cvtss2sd_int(<4 x float> *%a) {
define <2 x double> @load_fold_cvtss2sd_int_optsize(<4 x float> *%a) optsize {
; CHECK-LABEL: load_fold_cvtss2sd_int_optsize:
-; CHECK: xorps %xmm0, %xmm0
-; CHECK-NEXT: cvtss2sd (%rdi), %xmm0
-; CHECK-NEXT: retq
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: cvtss2sd (%rdi), %xmm0
+; CHECK-NEXT: retq
%ld = load <4 x float>, <4 x float> *%a
%x = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %ld)
ret <2 x double> %x
@@ -113,9 +120,10 @@ define <2 x double> @load_fold_cvtss2sd_int_optsize(<4 x float> *%a) optsize {
define <2 x double> @load_fold_cvtss2sd_int_minsize(<4 x float> *%a) minsize {
; CHECK-LABEL: load_fold_cvtss2sd_int_minsize:
-; CHECK: xorps %xmm0, %xmm0
-; CHECK-NEXT: cvtss2sd (%rdi), %xmm0
-; CHECK-NEXT: retq
+; CHECK: ## BB#0:
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: cvtss2sd (%rdi), %xmm0
+; CHECK-NEXT: retq
%ld = load <4 x float>, <4 x float> *%a
%x = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %ld)
ret <2 x double> %x
diff --git a/test/CodeGen/X86/ssp-data-layout.ll b/test/CodeGen/X86/ssp-data-layout.ll
index 4a63aceb7ccf..e954d9c1042a 100644
--- a/test/CodeGen/X86/ssp-data-layout.ll
+++ b/test/CodeGen/X86/ssp-data-layout.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -disable-fp-elim -mtriple=x86_64-pc-linux-gnu -mcpu=corei7 -o - | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -disable-fp-elim -mtriple=x86_64-pc-linux-gnu -mcpu=corei7 -o - | FileCheck %s
; This test is fairly fragile. The goal is to ensure that "large" stack
; objects are allocated closest to the stack protector (i.e., farthest away
; from the Stack Pointer.) In standard SSP mode this means that large (>=
diff --git a/test/CodeGen/X86/ssp-guard-spill.ll b/test/CodeGen/X86/ssp-guard-spill.ll
new file mode 100644
index 000000000000..7364dee4f080
--- /dev/null
+++ b/test/CodeGen/X86/ssp-guard-spill.ll
@@ -0,0 +1,54 @@
+; RUN: llc < %s | FileCheck %s
+target triple = "x86_64-apple-macosx10.4.0"
+
+; __stack_chk_guard must be loaded twice: once for setting up the canary and
+; again for performing the check. This is because if we reuse the same stack
+; guard value, it may get spilled to the stack, and the for loop may then
+; corrupt it.
+;
+; bool Bar(int*);
+; bool Foo(int n) {
+; int a[10];
+; for (int i = 0; i < n; i++) {
+; a[i] = 0;
+; }
+; return Bar(a);
+; }
+;
+; CHECK: movq ___stack_chk_guard
+; CHECK: movq ___stack_chk_guard
+define zeroext i1 @_Z3Fooi(i32 %n) sspstrong {
+entry:
+ %n.addr = alloca i32, align 4
+ %a = alloca [10 x i32], align 16
+ %i = alloca i32, align 4
+ store i32 %n, i32* %n.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %tmp = load i32, i32* %i, align 4
+ %tmp1 = load i32, i32* %n.addr, align 4
+ %cmp = icmp slt i32 %tmp, %tmp1
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %tmp2 = load i32, i32* %i, align 4
+ %idxprom = sext i32 %tmp2 to i64
+ %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %a, i64 0, i64 %idxprom
+ store i32 0, i32* %arrayidx, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %tmp3 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %tmp3, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %arraydecay = getelementptr inbounds [10 x i32], [10 x i32]* %a, i32 0, i32 0
+ %call = call zeroext i1 @_Z3BarPi(i32* %arraydecay)
+ ret i1 %call
+}
+
+declare zeroext i1 @_Z3BarPi(i32*)
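For illustration, roughly the failure mode the comment in ssp-guard-spill.ll describes if the guard were loaded only once and that value were kept in a spill slot across the loop (a hand-written sketch, not llc output; <canary-slot> and <spill-slot> are placeholder frame offsets, and the guard-load addressing is elided):

    # guard value loaded once into %rax (addressing elided), then:
    movq  %rax, <canary-slot>(%rsp)    # set up the canary
    movq  %rax, <spill-slot>(%rsp)     # same value spilled for the later check
    ...                                # loop stores to a[i]; an out-of-bounds i
                                       # can overwrite <spill-slot>
    movq  <spill-slot>(%rsp), %rax     # "expected" guard reloaded, possibly corrupted
    cmpq  <canary-slot>(%rsp), %rax    # comparison can be fooled

Reloading ___stack_chk_guard for the comparison, as the two CHECK lines above require, avoids trusting any stack copy of the guard.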
diff --git a/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll b/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll
index 4f7ff20c6e0d..163dc0bc9a0c 100644
--- a/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=X32
-; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=X64
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/ssse3-builtins.c
@@ -57,13 +58,13 @@ declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone
define <2 x i64> @test_mm_alignr_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_alignr_epi8:
; X32: # BB#0:
-; X32-NEXT: palignr {{.*#}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1]
+; X32-NEXT: palignr {{.*#+}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1]
; X32-NEXT: movdqa %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_alignr_epi8:
; X64: # BB#0:
-; X64-NEXT: palignr {{.*#}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1]
+; X64-NEXT: palignr {{.*#+}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1]
; X64-NEXT: movdqa %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -73,6 +74,25 @@ define <2 x i64> @test_mm_alignr_epi8(<2 x i64> %a0, <2 x i64> %a1) {
ret <2 x i64> %res
}
+define <2 x i64> @test2_mm_alignr_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test2_mm_alignr_epi8:
+; X32: # BB#0:
+; X32-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
+; X32-NEXT: movdqa %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test2_mm_alignr_epi8:
+; X64: # BB#0:
+; X64-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %shuf = shufflevector <16 x i8> %arg0, <16 x i8> %arg1, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
+ %res = bitcast <16 x i8> %shuf to <2 x i64>
+ ret <2 x i64> %res
+}
+
define <2 x i64> @test_mm_hadd_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_hadd_epi16:
; X32: # BB#0:
diff --git a/test/CodeGen/X86/stack-align.ll b/test/CodeGen/X86/stack-align.ll
index 0cff95f266a9..04bae023984f 100644
--- a/test/CodeGen/X86/stack-align.ll
+++ b/test/CodeGen/X86/stack-align.ll
@@ -61,3 +61,31 @@ entry:
; CHECK-NOT: and
; CHECK: ret
}
+
+%struct.sixteen = type { [16 x i8] }
+
+; Accessing stack parameters shouldn't assume stack alignment. Here we should
+; emit two 8-byte loads, followed by two 8-byte stores.
+define x86_stdcallcc void @test5(%struct.sixteen* byval nocapture readonly align 4 %s) #0 {
+ %d.sroa.0 = alloca [16 x i8], align 1
+ %1 = getelementptr inbounds [16 x i8], [16 x i8]* %d.sroa.0, i32 0, i32 0
+ call void @llvm.lifetime.start(i64 16, i8* %1)
+ %2 = getelementptr inbounds %struct.sixteen, %struct.sixteen* %s, i32 0, i32 0, i32 0
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %2, i32 16, i32 1, i1 true)
+ call void @llvm.lifetime.end(i64 16, i8* %1)
+ ret void
+; CHECK-LABEL: test5:
+; CHECK: and
+; CHECK: movsd
+; CHECK-NEXT: movsd
+; CHECK-NEXT: movsd
+; CHECK-NEXT: movsd
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) argmemonly nounwind
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) argmemonly nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) argmemonly nounwind
+
+attributes #0 = { nounwind alignstack=16 "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
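As a rough picture of what the CHECK lines for test5 expect (a hand-written sketch, not llc output; the %ebp offsets and register choices are placeholders): the byval parameter is only 4-byte aligned, so the 16-byte copy must be split into 8-byte loads and stores rather than a single aligned 16-byte move, even though the function realigns its own stack for the local:

    andl   $-16, %esp              # realign the frame for the 16-byte local
    movsd  <arg>(%ebp), %xmm0      # two 8-byte loads from the 4-byte-aligned parameter
    movsd  <arg+8>(%ebp), %xmm1
    movsd  %xmm0, (%esp)           # two 8-byte stores into the local copy
    movsd  %xmm1, 8(%esp)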
diff --git a/test/CodeGen/X86/stack-align2.ll b/test/CodeGen/X86/stack-align2.ll
index 18cce7266d13..7239198000c9 100644
--- a/test/CodeGen/X86/stack-align2.ll
+++ b/test/CodeGen/X86/stack-align2.ll
@@ -1,7 +1,9 @@
; RUN: llc < %s -mcpu=generic -mtriple=i386-linux | FileCheck %s -check-prefix=LINUX-I386
+; RUN: llc < %s -mcpu=generic -mtriple=i386-kfreebsd | FileCheck %s -check-prefix=KFREEBSD-I386
; RUN: llc < %s -mcpu=generic -mtriple=i386-netbsd | FileCheck %s -check-prefix=NETBSD-I386
; RUN: llc < %s -mcpu=generic -mtriple=i686-apple-darwin8 | FileCheck %s -check-prefix=DARWIN-I386
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s -check-prefix=LINUX-X86_64
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-kfreebsd | FileCheck %s -check-prefix=KFREEBSD-X86_64
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-netbsd | FileCheck %s -check-prefix=NETBSD-X86_64
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-apple-darwin8 | FileCheck %s -check-prefix=DARWIN-X86_64
@@ -11,6 +13,7 @@ entry:
ret i32 0
; LINUX-I386: subl $12, %esp
+; KFREEBSD-I386: subl $12, %esp
; DARWIN-I386: subl $12, %esp
; NETBSD-I386-NOT: subl {{.*}}, %esp
@@ -20,6 +23,8 @@ entry:
; DARWIN-X86_64-NOT: subq {{.*}}, %rsp
; NETBSD-X86_64: pushq %{{.*}}
; NETBSD-X86_64-NOT: subq {{.*}}, %rsp
+; KFREEBSD-X86_64: pushq %{{.*}}
+; KFREEBSD-X86_64-NOT: subq {{.*}}, %rsp
}
declare void @test2()
diff --git a/test/CodeGen/X86/stack-folding-fp-avx1.ll b/test/CodeGen/X86/stack-folding-fp-avx1.ll
index b86ec0ea22ff..5dfdf4b98adf 100644
--- a/test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ b/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -384,6 +384,14 @@ define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
;CHECK-LABEL: stack_fold_cvtdq2pd
;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %3 = sitofp <2 x i32> %2 to <2 x double>
+ ret <2 x double> %3
+}
+define <2 x double> @stack_fold_cvtdq2pd_int(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtdq2pd_int
+ ;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0)
ret <2 x double> %2
}
@@ -393,6 +401,14 @@ define <4 x double> @stack_fold_cvtdq2pd_ymm(<4 x i32> %a0) {
;CHECK-LABEL: stack_fold_cvtdq2pd_ymm
;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = sitofp <4 x i32> %a0 to <4 x double>
+ ret <4 x double> %2
+}
+
+define <4 x double> @stack_fold_cvtdq2pd_ymm_int(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtdq2pd_ymm_int
+ ;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %a0)
ret <4 x double> %2
}
@@ -488,6 +504,15 @@ define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {
;CHECK-LABEL: stack_fold_cvtps2pd
;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+ %3 = fpext <2 x float> %2 to <2 x double>
+ ret <2 x double> %3
+}
+
+define <2 x double> @stack_fold_cvtps2pd_int(<4 x float> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtps2pd_int
+ ;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0)
ret <2 x double> %2
}
@@ -497,6 +522,14 @@ define <4 x double> @stack_fold_cvtps2pd_ymm(<4 x float> %a0) {
;CHECK-LABEL: stack_fold_cvtps2pd_ymm
;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = fpext <4 x float> %a0 to <4 x double>
+ ret <4 x double> %2
+}
+
+define <4 x double> @stack_fold_cvtps2pd_ymm_int(<4 x float> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtps2pd_ymm_int
+ ;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float> %a0)
ret <4 x double> %2
}
@@ -524,7 +557,7 @@ declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly
define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) {
;CHECK-LABEL: stack_fold_cvtsd2si_int
- ;CHECK: cvtsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vcvtsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
ret i32 %2
@@ -535,7 +568,7 @@ declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) {
;CHECK-LABEL: stack_fold_cvtsd2si64_int
- ;CHECK: cvtsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vcvtsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
ret i64 %2
@@ -546,7 +579,7 @@ declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
define <4 x float> @stack_fold_cvtsd2ss_int(<2 x double> %a0) {
;CHECK-LABEL: stack_fold_cvtsd2ss_int
- ;CHECK: cvtsd2ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vcvtsd2ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, <2 x double> %a0)
ret <4 x float> %2
@@ -555,7 +588,7 @@ declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind
define double @stack_fold_cvtsi2sd(i32 %a0) {
;CHECK-LABEL: stack_fold_cvtsi2sd
- ;CHECK: cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ ;CHECK: vcvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = sitofp i32 %a0 to double
ret double %2
@@ -563,7 +596,7 @@ define double @stack_fold_cvtsi2sd(i32 %a0) {
define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0) {
;CHECK-LABEL: stack_fold_cvtsi2sd_int
- ;CHECK: cvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ ;CHECK: vcvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> <double 0x0, double 0x0>, i32 %a0)
ret <2 x double> %2
@@ -572,7 +605,7 @@ declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnon
define double @stack_fold_cvtsi642sd(i64 %a0) {
;CHECK-LABEL: stack_fold_cvtsi642sd
- ;CHECK: cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ ;CHECK: vcvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = sitofp i64 %a0 to double
ret double %2
@@ -580,7 +613,7 @@ define double @stack_fold_cvtsi642sd(i64 %a0) {
define <2 x double> @stack_fold_cvtsi642sd_int(i64 %a0) {
;CHECK-LABEL: stack_fold_cvtsi642sd_int
- ;CHECK: cvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ ;CHECK: vcvtsi2sdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> <double 0x0, double 0x0>, i64 %a0)
ret <2 x double> %2
@@ -589,7 +622,7 @@ declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readn
define float @stack_fold_cvtsi2ss(i32 %a0) {
;CHECK-LABEL: stack_fold_cvtsi2ss
- ;CHECK: cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ ;CHECK: vcvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = sitofp i32 %a0 to float
ret float %2
@@ -597,7 +630,7 @@ define float @stack_fold_cvtsi2ss(i32 %a0) {
define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0) {
;CHECK-LABEL: stack_fold_cvtsi2ss_int
- ;CHECK: cvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ ;CHECK: vcvtsi2ssl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, i32 %a0)
ret <4 x float> %2
@@ -606,7 +639,7 @@ declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
define float @stack_fold_cvtsi642ss(i64 %a0) {
;CHECK-LABEL: stack_fold_cvtsi642ss
- ;CHECK: cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ ;CHECK: vcvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = sitofp i64 %a0 to float
ret float %2
@@ -614,7 +647,7 @@ define float @stack_fold_cvtsi642ss(i64 %a0) {
define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) {
;CHECK-LABEL: stack_fold_cvtsi642ss_int
- ;CHECK: cvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ ;CHECK: vcvtsi2ssq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, i64 %a0)
ret <4 x float> %2
@@ -625,7 +658,7 @@ declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
define <2 x double> @stack_fold_cvtss2sd_int(<4 x float> %a0) {
;CHECK-LABEL: stack_fold_cvtss2sd_int
- ;CHECK: cvtss2sd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vcvtss2sd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %a0)
ret <2 x double> %2
@@ -748,7 +781,7 @@ define i64 @stack_fold_cvttss2si64(float %a0) {
define i64 @stack_fold_cvttss2si64_int(<4 x float> %a0) {
;CHECK-LABEL: stack_fold_cvttss2si64_int
- ;CHECK: cvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vcvttss2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
ret i64 %2
diff --git a/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
new file mode 100644
index 000000000000..3ab96e3f4629
--- /dev/null
+++ b/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
@@ -0,0 +1,137 @@
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Stack reload folding tests.
+;
+; By including a nop call with sideeffects we can force a partial register spill of the
+; relevant registers and check that the reload is correctly folded into the instruction.
+
+define double @stack_fold_addsd(double %a0, double %a1) {
+ ;CHECK-LABEL: stack_fold_addsd
+ ;CHECK: vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = fadd double %a0, %a1
+ ret double %2
+}
+
+define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
+ ;CHECK-LABEL: stack_fold_addsd_int
+ ;CHECK: vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1)
+ ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_addss(float %a0, float %a1) {
+ ;CHECK-LABEL: stack_fold_addss
+ ;CHECK: vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = fadd float %a0, %a1
+ ret float %2
+}
+
+define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_addss_int
+ ;CHECK: vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
+ ;CHECK-LABEL: stack_fold_divsd_int
+ ;CHECK: vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1)
+ ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_divss(float %a0, float %a1) {
+ ;CHECK-LABEL: stack_fold_divss
+ ;CHECK: vdivss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = fdiv float %a0, %a1
+ ret float %2
+}
+
+define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_divss_int
+ ;CHECK: vdivss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define double @stack_fold_mulsd(double %a0, double %a1) {
+ ;CHECK-LABEL: stack_fold_mulsd
+ ;CHECK: vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = fmul double %a0, %a1
+ ret double %2
+}
+
+define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {
+ ;CHECK-LABEL: stack_fold_mulsd_int
+ ;CHECK: vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1)
+ ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_mulss(float %a0, float %a1) {
+ ;CHECK-LABEL: stack_fold_mulss
+ ;CHECK: vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = fmul float %a0, %a1
+ ret float %2
+}
+
+define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_mulss_int
+ ;CHECK: vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define double @stack_fold_subsd(double %a0, double %a1) {
+ ;CHECK-LABEL: stack_fold_subsd
+ ;CHECK: vsubsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = fsub double %a0, %a1
+ ret double %2
+}
+
+define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {
+ ;CHECK-LABEL: stack_fold_subsd_int
+ ;CHECK: vsubsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1)
+ ret <2 x double> %2
+}
+declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define float @stack_fold_subss(float %a0, float %a1) {
+ ;CHECK-LABEL: stack_fold_subss
+ ;CHECK: vsubss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = fsub float %a0, %a1
+ ret float %2
+}
+
+define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_subss_int
+ ;CHECK: vsubss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %2
+}
+declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind readnone
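To make the pattern described in the header comment of stack-folding-fp-avx512vl.ll concrete, here is roughly what each CHECK line matches (a sketch, not actual llc output; register numbers and the spill offset are placeholders): the inline asm clobbers nearly every xmm register, so at least one live operand has to be spilled around the nop, and the test verifies that its reload is folded into the arithmetic instruction as a memory operand rather than done with a separate load:

    vmovsd  %xmm1, -8(%rsp)            # operand spilled around the inline asm
    #APP
    nop
    #NO_APP
    vaddsd  -8(%rsp), %xmm0, %xmm0     # reload folded directly into vaddsd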
diff --git a/test/CodeGen/X86/stack-folding-fp-sse42.ll b/test/CodeGen/X86/stack-folding-fp-sse42.ll
index 9f689cfe85e5..4c675356df6d 100644
--- a/test/CodeGen/X86/stack-folding-fp-sse42.ll
+++ b/test/CodeGen/X86/stack-folding-fp-sse42.ll
@@ -244,6 +244,15 @@ define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
;CHECK-LABEL: stack_fold_cvtdq2pd
;CHECK: cvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %3 = sitofp <2 x i32> %2 to <2 x double>
+ ret <2 x double> %3
+}
+
+define <2 x double> @stack_fold_cvtdq2pd_int(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtdq2pd_int
+ ;CHECK: cvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0)
ret <2 x double> %2
}
@@ -287,6 +296,15 @@ define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {
;CHECK-LABEL: stack_fold_cvtps2pd
;CHECK: cvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+ %2 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+ %3 = fpext <2 x float> %2 to <2 x double>
+ ret <2 x double> %3
+}
+
+define <2 x double> @stack_fold_cvtps2pd_int(<4 x float> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtps2pd_int
+ ;CHECK: cvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0)
ret <2 x double> %2
}
diff --git a/test/CodeGen/X86/stack-folding-int-avx2.ll b/test/CodeGen/X86/stack-folding-int-avx2.ll
index 235a10ed4678..ef7fa2217145 100644
--- a/test/CodeGen/X86/stack-folding-int-avx2.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx2.ll
@@ -253,7 +253,9 @@ define <4 x i32> @stack_fold_pblendd(<4 x i32> %a0, <4 x i32> %a1) {
;CHECK: vpblendd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
- ret <4 x i32> %2
+ ; add forces execution domain
+ %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %3
}
define <8 x i32> @stack_fold_pblendd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
@@ -261,7 +263,9 @@ define <8 x i32> @stack_fold_pblendd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
;CHECK: vpblendd $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <8 x i32> %2
+ ; add forces execution domain
+ %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i32> %3
}
define <32 x i8> @stack_fold_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %c) {
@@ -658,19 +662,19 @@ define <8 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pmovsxbd
;CHECK: vpmovsxbd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %a0)
- ret <8 x i32> %2
+ %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %3 = sext <8 x i8> %2 to <8 x i32>
+ ret <8 x i32> %3
}
-declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone
define <4 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pmovsxbq
;CHECK: pmovsxbq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %a0)
- ret <4 x i64> %2
+ %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = sext <4 x i8> %2 to <4 x i64>
+ ret <4 x i64> %3
}
-declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone
define <16 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pmovsxbw
@@ -700,64 +704,61 @@ define <4 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
;CHECK-LABEL: stack_fold_pmovsxwq
;CHECK: vpmovsxwq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %a0)
- ret <4 x i64> %2
+ %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = sext <4 x i16> %2 to <4 x i64>
+ ret <4 x i64> %3
}
-declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone
define <8 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pmovzxbd
;CHECK: vpmovzxbd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %a0)
- ret <8 x i32> %2
+ %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %3 = zext <8 x i8> %2 to <8 x i32>
+ ret <8 x i32> %3
}
-declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone
define <4 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pmovzxbq
;CHECK: vpmovzxbq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %a0)
- ret <4 x i64> %2
+ %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = zext <4 x i8> %2 to <4 x i64>
+ ret <4 x i64> %3
}
-declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone
define <16 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_pmovzxbw
;CHECK: vpmovzxbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %a0)
+ %2 = zext <16 x i8> %a0 to <16 x i16>
ret <16 x i16> %2
}
-declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone
define <4 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
;CHECK-LABEL: stack_fold_pmovzxdq
;CHECK: vpmovzxdq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %a0)
+ %2 = zext <4 x i32> %a0 to <4 x i64>
ret <4 x i64> %2
}
-declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone
define <8 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
;CHECK-LABEL: stack_fold_pmovzxwd
;CHECK: vpmovzxwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %a0)
+ %2 = zext <8 x i16> %a0 to <8 x i32>
ret <8 x i32> %2
}
-declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone
define <4 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
;CHECK-LABEL: stack_fold_pmovzxwq
;CHECK: vpmovzxwq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %a0)
- ret <4 x i64> %2
+ %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = zext <4 x i16> %2 to <4 x i64>
+ ret <4 x i64> %3
}
-declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone
define <4 x i64> @stack_fold_pmuldq(<8 x i32> %a0, <8 x i32> %a1) {
;CHECK-LABEL: stack_fold_pmuldq
diff --git a/test/CodeGen/X86/stack-folding-xop.ll b/test/CodeGen/X86/stack-folding-xop.ll
index d0c48b400804..115d1a9cad3a 100644
--- a/test/CodeGen/X86/stack-folding-xop.ll
+++ b/test/CodeGen/X86/stack-folding-xop.ll
@@ -166,69 +166,69 @@ define <8 x i16> @stack_fold_vpcomw(<8 x i16> %a0, <8 x i16> %a1) {
}
declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone
-define <2 x double> @stack_fold_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+define <2 x double> @stack_fold_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2) {
;CHECK-LABEL: stack_fold_vpermil2pd_rm
;CHECK: vpermil2pd $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 0)
+ %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2, i8 0)
ret <2 x double> %2
}
-define <2 x double> @stack_fold_vpermil2pd_mr(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+define <2 x double> @stack_fold_vpermil2pd_mr(<2 x double> %a0, <2 x i64> %a1, <2 x double> %a2) {
;CHECK-LABEL: stack_fold_vpermil2pd_mr
;CHECK: vpermil2pd $0, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a2, <2 x double> %a1, i8 0)
+ %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a2, <2 x i64> %a1, i8 0)
ret <2 x double> %2
}
-declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone
-define <4 x double> @stack_fold_vpermil2pd_rm_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+define <4 x double> @stack_fold_vpermil2pd_rm_ymm(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2) {
;CHECK-LABEL: stack_fold_vpermil2pd_rm
;CHECK: vpermil2pd $0, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 0)
+ %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2, i8 0)
ret <4 x double> %2
}
-define <4 x double> @stack_fold_vpermil2pd_mr_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+define <4 x double> @stack_fold_vpermil2pd_mr_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x double> %a2) {
;CHECK-LABEL: stack_fold_vpermil2pd_mr
;CHECK: vpermil2pd $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a2, <4 x double> %a1, i8 0)
+ %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a2, <4 x i64> %a1, i8 0)
ret <4 x double> %2
}
-declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone
-define <4 x float> @stack_fold_vpermil2ps_rm(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+define <4 x float> @stack_fold_vpermil2ps_rm(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
;CHECK-LABEL: stack_fold_vpermil2ps_rm
;CHECK: vpermil2ps $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 0)
+ %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2, i8 0)
ret <4 x float> %2
}
-define <4 x float> @stack_fold_vpermil2ps_mr(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+define <4 x float> @stack_fold_vpermil2ps_mr(<4 x float> %a0, <4 x i32> %a1, <4 x float> %a2) {
;CHECK-LABEL: stack_fold_vpermil2ps_mr
;CHECK: vpermil2ps $0, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a2, <4 x float> %a1, i8 0)
+ %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a2, <4 x i32> %a1, i8 0)
ret <4 x float> %2
}
-declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone
-define <8 x float> @stack_fold_vpermil2ps_rm_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+define <8 x float> @stack_fold_vpermil2ps_rm_ymm(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2) {
;CHECK-LABEL: stack_fold_vpermil2ps_rm
;CHECK: vpermil2ps $0, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 0)
+ %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2, i8 0)
ret <8 x float> %2
}
-define <8 x float> @stack_fold_vpermil2ps_mr_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+define <8 x float> @stack_fold_vpermil2ps_mr_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x float> %a2) {
;CHECK-LABEL: stack_fold_vpermil2ps_mr
;CHECK: vpermil2ps $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a2, <8 x float> %a1, i8 0)
+ %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a2, <8 x i32> %a1, i8 0)
ret <8 x float> %2
}
-declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone
define <4 x i32> @stack_fold_vphaddbd(<16 x i8> %a0) {
;CHECK-LABEL: stack_fold_vphaddbd
diff --git a/test/CodeGen/X86/stack-protector-dbginfo.ll b/test/CodeGen/X86/stack-protector-dbginfo.ll
index 237b96603c00..8413b8ef82cb 100644
--- a/test/CodeGen/X86/stack-protector-dbginfo.ll
+++ b/test/CodeGen/X86/stack-protector-dbginfo.ll
@@ -25,7 +25,7 @@ attributes #0 = { sspreq }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!21, !72}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.4 ", isOptimized: true, emissionKind: 1, file: !1, enums: !2, retainedTypes: !5, subprograms: !8, globals: !20, imports: !5)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.4 ", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !5, globals: !20, imports: !5)
!1 = !DIFile(filename: "<unknown>", directory: "/Users/matt/ryan_bug")
!2 = !{!3}
!3 = !DICompositeType(tag: DW_TAG_enumeration_type, line: 20, size: 32, align: 32, file: !1, scope: !4, elements: !6)
@@ -33,8 +33,7 @@ attributes #0 = { sspreq }
!5 = !{}
!6 = !{!7}
!7 = !DIEnumerator(name: "max_frame_size", value: 0) ; [ DW_TAG_enumerator ] [max_frame_size :: 0]
-!8 = !{!9, !24, !41, !65}
-!9 = distinct !DISubprogram(name: "read_response_size", linkageName: "_Z18read_response_sizev", line: 27, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 27, file: !1, scope: !10, type: !11, variables: !14)
+!9 = distinct !DISubprogram(name: "read_response_size", linkageName: "_Z18read_response_sizev", line: 27, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 27, file: !1, scope: !10, type: !11, variables: !14)
!10 = !DIFile(filename: "<unknown>", directory: "/Users/matt/ryan_bug")
!11 = !DISubroutineType(types: !12)
!12 = !{!13}
@@ -49,7 +48,7 @@ attributes #0 = { sspreq }
!21 = !{i32 2, !"Dwarf Version", i32 2}
!22 = !{i64* getelementptr inbounds ({ i64, [56 x i8] }, { i64, [56 x i8] }* @a, i32 0, i32 0)}
!23 = !DILocalVariable(name: "p2", line: 12, arg: 2, scope: !24, file: !10, type: !32)
-!24 = distinct !DISubprogram(name: "min<unsigned long long>", linkageName: "_ZN3__13minIyEERKT_S3_RS1_", line: 12, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 12, file: !1, scope: !25, type: !27, templateParams: !33, variables: !35)
+!24 = distinct !DISubprogram(name: "min<unsigned long long>", linkageName: "_ZN3__13minIyEERKT_S3_RS1_", line: 12, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 12, file: !1, scope: !25, type: !27, templateParams: !33, variables: !35)
!25 = !DINamespace(name: "__1", line: 1, file: !26, scope: null)
!26 = !DIFile(filename: "main.cpp", directory: "/Users/matt/ryan_bug")
!27 = !DISubroutineType(types: !28)
@@ -66,7 +65,7 @@ attributes #0 = { sspreq }
!38 = !DILocation(line: 33, scope: !9)
!39 = !DILocation(line: 12, scope: !24, inlinedAt: !38)
!40 = !DILocation(line: 9, scope: !41, inlinedAt: !59)
-!41 = distinct !DISubprogram(name: "min<unsigned long long, __1::A>", linkageName: "_ZN3__13minIyNS_1AEEERKT_S4_RS2_T0_", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 8, file: !1, scope: !25, type: !42, templateParams: !53, variables: !55)
+!41 = distinct !DISubprogram(name: "min<unsigned long long, __1::A>", linkageName: "_ZN3__13minIyNS_1AEEERKT_S4_RS2_T0_", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 8, file: !1, scope: !25, type: !42, templateParams: !53, variables: !55)
!42 = !DISubroutineType(types: !43)
!43 = !{!29, !29, !32, !44}
!44 = !DICompositeType(tag: DW_TAG_structure_type, name: "A", size: 8, align: 8, file: !1, scope: !25, elements: !45)
@@ -86,7 +85,7 @@ attributes #0 = { sspreq }
!59 = !DILocation(line: 13, scope: !24, inlinedAt: !38)
!63 = !{i32 undef}
!64 = !DILocalVariable(name: "p1", line: 1, arg: 2, scope: !65, file: !10, type: !50)
-!65 = distinct !DISubprogram(name: "operator()", linkageName: "_ZN3__11AclERKiS2_", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2, file: !1, scope: !25, type: !47, declaration: !46, variables: !66)
+!65 = distinct !DISubprogram(name: "operator()", linkageName: "_ZN3__11AclERKiS2_", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 2, file: !1, scope: !25, type: !47, declaration: !46, variables: !66)
!66 = !{!67, !69, !70}
!67 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !65, type: !68)
!68 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !44)
diff --git a/test/CodeGen/X86/stack-protector-msvc.ll b/test/CodeGen/X86/stack-protector-msvc.ll
new file mode 100644
index 000000000000..5eccc65f2dec
--- /dev/null
+++ b/test/CodeGen/X86/stack-protector-msvc.ll
@@ -0,0 +1,40 @@
+
+; RUN: llc -mtriple=i386-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-I386 %s
+; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-64 %s
+
+; MSVC-I386: movl ___security_cookie, %[[REG1:[a-z]*]]
+; MSVC-I386: movl %[[REG1]], [[SLOT:[0-9]*]](%esp)
+; MSVC-I386: calll _strcpy
+; MSVC-I386: movl [[SLOT]](%esp), %ecx
+; MSVC-I386: calll @__security_check_cookie@4
+; MSVC-I386: retl
+
+; MSVC-64: movq __security_cookie(%rip), %[[REG1:[a-z]*]]
+; MSVC-64: movq %[[REG1]], [[SLOT:[0-9]*]](%rsp)
+; MSVC-64: callq strcpy
+; MSVC-64: movq [[SLOT]](%rsp), %rcx
+; MSVC-64: callq __security_check_cookie
+
+@"\01LC" = internal constant [11 x i8] c"buf == %s\0A\00" ; <[11 x i8]*> [#uses=1]
+
+define void @test(i8* %a) nounwind ssp {
+entry:
+ %a_addr = alloca i8* ; <i8**> [#uses=2]
+ %buf = alloca [8 x i8] ; <[8 x i8]*> [#uses=2]
+ %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
+ store i8* %a, i8** %a_addr
+ %buf1 = bitcast [8 x i8]* %buf to i8* ; <i8*> [#uses=1]
+ %0 = load i8*, i8** %a_addr, align 4 ; <i8*> [#uses=1]
+ %1 = call i8* @strcpy(i8* %buf1, i8* %0) nounwind ; <i8*> [#uses=0]
+ %buf2 = bitcast [8 x i8]* %buf to i8* ; <i8*> [#uses=1]
+ %2 = call i32 (i8*, ...) @printf(i8* getelementptr ([11 x i8], [11 x i8]* @"\01LC", i32 0, i32 0), i8* %buf2) nounwind ; <i32> [#uses=0]
+ br label %return
+
+return: ; preds = %entry
+ ret void
+}
+
+declare i8* @strcpy(i8*, i8*) nounwind
+
+declare i32 @printf(i8*, ...) nounwind
+
diff --git a/test/CodeGen/X86/stack-protector-target.ll b/test/CodeGen/X86/stack-protector-target.ll
new file mode 100644
index 000000000000..66e45055b2b5
--- /dev/null
+++ b/test/CodeGen/X86/stack-protector-target.ll
@@ -0,0 +1,27 @@
+; Test target-specific stack cookie location.
+; RUN: llc -mtriple=i386-linux < %s -o - | FileCheck --check-prefix=LINUX-I386 %s
+; RUN: llc -mtriple=x86_64-linux < %s -o - | FileCheck --check-prefix=LINUX-X64 %s
+; RUN: llc -mtriple=i386-linux-android < %s -o - | FileCheck --check-prefix=LINUX-I386 %s
+; RUN: llc -mtriple=x86_64-linux-android < %s -o - | FileCheck --check-prefix=LINUX-X64 %s
+; RUN: llc -mtriple=i386-kfreebsd < %s -o - | FileCheck --check-prefix=LINUX-I386 %s
+; RUN: llc -mtriple=x86_64-kfreebsd < %s -o - | FileCheck --check-prefix=LINUX-X64 %s
+
+define void @_Z1fv() sspreq {
+entry:
+ %x = alloca i32, align 4
+ %0 = bitcast i32* %x to i8*
+ call void @_Z7CapturePi(i32* nonnull %x)
+ ret void
+}
+
+declare void @_Z7CapturePi(i32*)
+
+; LINUX-X64: movq %fs:40, %[[B:.*]]
+; LINUX-X64: movq %[[B]], 16(%rsp)
+; LINUX-X64: movq %fs:40, %[[C:.*]]
+; LINUX-X64: cmpq 16(%rsp), %[[C]]
+
+; LINUX-I386: movl %gs:20, %[[B:.*]]
+; LINUX-I386: movl %[[B]], 8(%esp)
+; LINUX-I386: movl %gs:20, %[[C:.*]]
+; LINUX-I386: cmpl 8(%esp), %[[C]]
diff --git a/test/CodeGen/X86/stack-protector-weight.ll b/test/CodeGen/X86/stack-protector-weight.ll
index dea66d28e3dd..58c6c713941d 100644
--- a/test/CodeGen/X86/stack-protector-weight.ll
+++ b/test/CodeGen/X86/stack-protector-weight.ll
@@ -1,17 +1,31 @@
-; RUN: llc -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=true %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=SELDAG
-; RUN: llc -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=false %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=IR
+; RUN: llc -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=true %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=DARWIN-SELDAG
+; RUN: llc -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=false %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=DARWIN-IR
+; RUN: llc -mtriple=i386-pc-windows-msvc -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=true %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=MSVC-SELDAG
+; RUN: llc -mtriple=i386-pc-windows-msvc -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=false %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=MSVC-IR
-; SELDAG: # Machine code for function test_branch_weights:
-; SELDAG: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]]
-; SELDAG: BB#[[FAILURE]]:
-; SELDAG: CALL64pcrel32 <es:__stack_chk_fail>
-; SELDAG: BB#[[SUCCESS]]:
+; DARWIN-SELDAG: # Machine code for function test_branch_weights:
+; DARWIN-SELDAG: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]]
+; DARWIN-SELDAG: BB#[[FAILURE]]:
+; DARWIN-SELDAG: CALL64pcrel32 <es:__stack_chk_fail>
+; DARWIN-SELDAG: BB#[[SUCCESS]]:
-; IR: # Machine code for function test_branch_weights:
-; IR: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]]
-; IR: BB#[[SUCCESS]]:
-; IR: BB#[[FAILURE]]:
-; IR: CALL64pcrel32 <ga:@__stack_chk_fail>
+; DARWIN-IR: # Machine code for function test_branch_weights:
+; DARWIN-IR: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]]
+; DARWIN-IR: BB#[[SUCCESS]]:
+; DARWIN-IR: BB#[[FAILURE]]:
+; DARWIN-IR: CALL64pcrel32 <ga:@__stack_chk_fail>
+
+; MSVC-SELDAG: # Machine code for function test_branch_weights:
+; MSVC-SELDAG: mem:Volatile LD4[@__security_cookie]
+; MSVC-SELDAG: ST4[FixedStack0]
+; MSVC-SELDAG: LD4[FixedStack0]
+; MSVC-SELDAG: CALLpcrel32 <ga:@__security_check_cookie>
+
+; MSVC-IR: # Machine code for function test_branch_weights:
+; MSVC-IR: mem:Volatile LD4[@__security_cookie]
+; MSVC-IR: ST4[FixedStack0]
+; MSVC-IR: LD4[%StackGuardSlot]
+; MSVC-IR: CALLpcrel32 <ga:@__security_check_cookie>
define i32 @test_branch_weights(i32 %n) #0 {
entry:
@@ -33,4 +47,4 @@ declare void @foo2(i32*)
declare void @llvm.lifetime.end(i64, i8* nocapture)
-attributes #0 = { ssp "stack-protector-buffer-size"="8" }
+attributes #0 = { sspstrong "stack-protector-buffer-size"="8" }
diff --git a/test/CodeGen/X86/stack-protector.ll b/test/CodeGen/X86/stack-protector.ll
index 398b8548747b..ddfb14ca8cfe 100644
--- a/test/CodeGen/X86/stack-protector.ll
+++ b/test/CodeGen/X86/stack-protector.ll
@@ -3,6 +3,7 @@
; RUN: llc -code-model=kernel -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefix=LINUX-KERNEL-X64 %s
; RUN: llc -mtriple=x86_64-apple-darwin < %s -o - | FileCheck --check-prefix=DARWIN-X64 %s
; RUN: llc -mtriple=amd64-pc-openbsd < %s -o - | FileCheck --check-prefix=OPENBSD-AMD64 %s
+; RUN: llc -mtriple=i386-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-I386 %s
%struct.foo = type { [16 x i8] }
%struct.foo.0 = type { [4 x i8] }
@@ -40,6 +41,10 @@ entry:
; DARWIN-X64-LABEL: test1a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test1a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a.addr = alloca i8*, align 8
%buf = alloca [16 x i8], align 16
store i8* %a, i8** %a.addr, align 8
@@ -76,6 +81,10 @@ entry:
; OPENBSD-AMD64-LABEL: test1b:
; OPENBSD-AMD64: movq __guard_local(%rip)
; OPENBSD-AMD64: callq __stack_smash_handler
+
+; MSVC-I386-LABEL: test1b:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
%buf = alloca [16 x i8], align 16
store i8* %a, i8** %a.addr, align 8
@@ -108,6 +117,10 @@ entry:
; DARWIN-X64-LABEL: test1c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test1c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
%buf = alloca [16 x i8], align 16
store i8* %a, i8** %a.addr, align 8
@@ -140,6 +153,10 @@ entry:
; DARWIN-X64-LABEL: test1d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test1d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
%buf = alloca [16 x i8], align 16
store i8* %a, i8** %a.addr, align 8
@@ -171,6 +188,10 @@ entry:
; DARWIN-X64-LABEL: test2a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test2a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a.addr = alloca i8*, align 8
%b = alloca %struct.foo, align 1
store i8* %a, i8** %a.addr, align 8
@@ -239,6 +260,10 @@ entry:
; DARWIN-X64-LABEL: test2c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test2c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
%b = alloca %struct.foo, align 1
store i8* %a, i8** %a.addr, align 8
@@ -273,6 +298,10 @@ entry:
; DARWIN-X64-LABEL: test2d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test2d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
%b = alloca %struct.foo, align 1
store i8* %a, i8** %a.addr, align 8
@@ -306,6 +335,10 @@ entry:
; DARWIN-X64-LABEL: test3a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test3a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a.addr = alloca i8*, align 8
%buf = alloca [4 x i8], align 1
store i8* %a, i8** %a.addr, align 8
@@ -338,6 +371,10 @@ entry:
; DARWIN-X64-LABEL: test3b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test3b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a.addr = alloca i8*, align 8
%buf = alloca [4 x i8], align 1
store i8* %a, i8** %a.addr, align 8
@@ -370,6 +407,10 @@ entry:
; DARWIN-X64-LABEL: test3c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test3c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
%buf = alloca [4 x i8], align 1
store i8* %a, i8** %a.addr, align 8
@@ -402,6 +443,10 @@ entry:
; DARWIN-X64-LABEL: test3d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test3d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
%buf = alloca [4 x i8], align 1
store i8* %a, i8** %a.addr, align 8
@@ -433,6 +478,10 @@ entry:
; DARWIN-X64-LABEL: test4a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test4a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a.addr = alloca i8*, align 8
%b = alloca %struct.foo.0, align 1
store i8* %a, i8** %a.addr, align 8
@@ -467,6 +516,10 @@ entry:
; DARWIN-X64-LABEL: test4b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test4b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a.addr = alloca i8*, align 8
%b = alloca %struct.foo.0, align 1
store i8* %a, i8** %a.addr, align 8
@@ -501,6 +554,10 @@ entry:
; DARWIN-X64-LABEL: test4c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test4c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
%b = alloca %struct.foo.0, align 1
store i8* %a, i8** %a.addr, align 8
@@ -535,6 +592,10 @@ entry:
; DARWIN-X64-LABEL: test4d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test4d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
%b = alloca %struct.foo.0, align 1
store i8* %a, i8** %a.addr, align 8
@@ -568,6 +629,10 @@ entry:
; DARWIN-X64-LABEL: test5a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test5a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a.addr = alloca i8*, align 8
store i8* %a, i8** %a.addr, align 8
%0 = load i8*, i8** %a.addr, align 8
@@ -596,6 +661,10 @@ entry:
; DARWIN-X64-LABEL: test5b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test5b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a.addr = alloca i8*, align 8
store i8* %a, i8** %a.addr, align 8
%0 = load i8*, i8** %a.addr, align 8
@@ -624,6 +693,10 @@ entry:
; DARWIN-X64-LABEL: test5c:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test5c:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a.addr = alloca i8*, align 8
store i8* %a, i8** %a.addr, align 8
%0 = load i8*, i8** %a.addr, align 8
@@ -652,6 +725,10 @@ entry:
; DARWIN-X64-LABEL: test5d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test5d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a.addr = alloca i8*, align 8
store i8* %a, i8** %a.addr, align 8
%0 = load i8*, i8** %a.addr, align 8
@@ -679,6 +756,10 @@ entry:
; DARWIN-X64-LABEL: test6a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test6a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%retval = alloca i32, align 4
%a = alloca i32, align 4
%j = alloca i32*, align 8
@@ -711,6 +792,11 @@ entry:
; DARWIN-X64-LABEL: test6b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+
+; MSVC-I386-LABEL: test6b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%retval = alloca i32, align 4
%a = alloca i32, align 4
%j = alloca i32*, align 8
@@ -743,6 +829,10 @@ entry:
; DARWIN-X64-LABEL: test6c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test6c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%retval = alloca i32, align 4
%a = alloca i32, align 4
%j = alloca i32*, align 8
@@ -775,6 +865,10 @@ entry:
; DARWIN-X64-LABEL: test6d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test6d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%retval = alloca i32, align 4
%a = alloca i32, align 4
%j = alloca i32*, align 8
@@ -806,6 +900,10 @@ entry:
; DARWIN-X64-LABEL: test7a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test7a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
%0 = ptrtoint i32* %a to i64
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
@@ -833,6 +931,10 @@ entry:
; DARWIN-X64-LABEL: test7b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test7b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
%0 = ptrtoint i32* %a to i64
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
@@ -860,6 +962,10 @@ entry:
; DARWIN-X64-LABEL: test7c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test7c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
%0 = ptrtoint i32* %a to i64
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
@@ -887,6 +993,10 @@ entry:
; DARWIN-X64-LABEL: test7d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test7d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
%0 = ptrtoint i32* %a to i64
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 %0)
@@ -913,6 +1023,10 @@ entry:
; DARWIN-X64-LABEL: test8a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test8a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%b = alloca i32, align 4
call void @funcall(i32* %b)
ret void
@@ -939,6 +1053,10 @@ entry:
; DARWIN-X64-LABEL: test8b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test8b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%b = alloca i32, align 4
call void @funcall(i32* %b)
ret void
@@ -965,6 +1083,10 @@ entry:
; DARWIN-X64-LABEL: test8c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test8c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%b = alloca i32, align 4
call void @funcall(i32* %b)
ret void
@@ -991,6 +1113,10 @@ entry:
; DARWIN-X64-LABEL: test8d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test8d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%b = alloca i32, align 4
call void @funcall(i32* %b)
ret void
@@ -1016,6 +1142,10 @@ entry:
; DARWIN-X64-LABEL: test9a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test9a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%x = alloca double, align 8
%call = call double @testi_aux()
store double %call, double* %x, align 8
@@ -1046,6 +1176,10 @@ entry:
; DARWIN-X64-LABEL: test9b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test9b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%x = alloca double, align 8
%call = call double @testi_aux()
store double %call, double* %x, align 8
@@ -1076,6 +1210,10 @@ entry:
; DARWIN-X64-LABEL: test9c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test9c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%x = alloca double, align 8
%call = call double @testi_aux()
store double %call, double* %x, align 8
@@ -1106,6 +1244,10 @@ entry:
; DARWIN-X64-LABEL: test9d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test9d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%x = alloca double, align 8
%call = call double @testi_aux()
store double %call, double* %x, align 8
@@ -1135,6 +1277,10 @@ entry:
; DARWIN-X64-LABEL: test10a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test10a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%x = alloca double, align 8
%call = call double @testi_aux()
store double %call, double* %x, align 8
@@ -1180,6 +1326,10 @@ entry:
; DARWIN-X64-LABEL: test10b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test10b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%x = alloca double, align 8
%call = call double @testi_aux()
store double %call, double* %x, align 8
@@ -1225,6 +1375,10 @@ entry:
; DARWIN-X64-LABEL: test10c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test10c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%x = alloca double, align 8
%call = call double @testi_aux()
store double %call, double* %x, align 8
@@ -1270,6 +1424,10 @@ entry:
; DARWIN-X64-LABEL: test10d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test10d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%x = alloca double, align 8
%call = call double @testi_aux()
store double %call, double* %x, align 8
@@ -1314,6 +1472,10 @@ entry:
; DARWIN-X64-LABEL: test11a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test11a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.pair, align 4
%b = alloca i32*, align 8
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
@@ -1344,6 +1506,10 @@ entry:
; DARWIN-X64-LABEL: test11b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test11b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.pair, align 4
%b = alloca i32*, align 8
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
@@ -1374,6 +1540,10 @@ entry:
; DARWIN-X64-LABEL: test11c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test11c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.pair, align 4
%b = alloca i32*, align 8
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
@@ -1404,6 +1574,10 @@ entry:
; DARWIN-X64-LABEL: test11d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test11d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.pair, align 4
%b = alloca i32*, align 8
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
@@ -1433,6 +1607,10 @@ entry:
; DARWIN-X64-LABEL: test12a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test12a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.pair, align 4
%b = alloca i32*, align 8
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
@@ -1462,6 +1640,10 @@ entry:
; DARWIN-X64-LABEL: test12b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test12b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.pair, align 4
%b = alloca i32*, align 8
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
@@ -1490,6 +1672,10 @@ entry:
; DARWIN-X64-LABEL: test12c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test12c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.pair, align 4
%b = alloca i32*, align 8
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
@@ -1519,6 +1705,10 @@ entry:
; DARWIN-X64-LABEL: test12d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test12d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.pair, align 4
%b = alloca i32*, align 8
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i32 0, i32 1
@@ -1547,6 +1737,10 @@ entry:
; DARWIN-X64-LABEL: test13a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test13a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.pair, align 4
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i64 0, i32 1
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
@@ -1574,6 +1768,10 @@ entry:
; DARWIN-X64-LABEL: test13b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test13b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.pair, align 4
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i64 0, i32 1
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
@@ -1601,6 +1799,10 @@ entry:
; DARWIN-X64-LABEL: test13c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test13c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.pair, align 4
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i64 0, i32 1
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
@@ -1628,6 +1830,10 @@ entry:
; DARWIN-X64-LABEL: test13d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test13d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.pair, align 4
%y = getelementptr inbounds %struct.pair, %struct.pair* %c, i64 0, i32 1
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %y)
@@ -1654,6 +1860,10 @@ entry:
; DARWIN-X64-LABEL: test14a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test14a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
%add.ptr5 = getelementptr inbounds i32, i32* %a, i64 -12
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
@@ -1681,6 +1891,10 @@ entry:
; DARWIN-X64-LABEL: test14b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test14b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
%add.ptr5 = getelementptr inbounds i32, i32* %a, i64 -12
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
@@ -1708,6 +1922,10 @@ entry:
; DARWIN-X64-LABEL: test14c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test14c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
%add.ptr5 = getelementptr inbounds i32, i32* %a, i64 -12
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
@@ -1735,6 +1953,10 @@ entry:
; DARWIN-X64-LABEL: test14d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test14d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
%add.ptr5 = getelementptr inbounds i32, i32* %a, i64 -12
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
@@ -1762,6 +1984,10 @@ entry:
; DARWIN-X64-LABEL: test15a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test15a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
%b = alloca float*, align 8
store i32 0, i32* %a, align 4
@@ -1794,6 +2020,10 @@ entry:
; DARWIN-X64-LABEL: test15b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test15b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
%b = alloca float*, align 8
store i32 0, i32* %a, align 4
@@ -1826,6 +2056,10 @@ entry:
; DARWIN-X64-LABEL: test15c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test15c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
%b = alloca float*, align 8
store i32 0, i32* %a, align 4
@@ -1858,6 +2092,10 @@ entry:
; DARWIN-X64-LABEL: test15d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test15d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
%b = alloca float*, align 8
store i32 0, i32* %a, align 4
@@ -1889,6 +2127,10 @@ entry:
; DARWIN-X64-LABEL: test16a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test16a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
store i32 0, i32* %a, align 4
%0 = bitcast i32* %a to float*
@@ -1918,6 +2160,10 @@ entry:
; DARWIN-X64-LABEL: test16b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test16b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
store i32 0, i32* %a, align 4
%0 = bitcast i32* %a to float*
@@ -1947,6 +2193,10 @@ entry:
; DARWIN-X64-LABEL: test16c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test16c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
store i32 0, i32* %a, align 4
%0 = bitcast i32* %a to float*
@@ -1976,6 +2226,10 @@ entry:
; DARWIN-X64-LABEL: test16d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test16d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
store i32 0, i32* %a, align 4
%0 = bitcast i32* %a to float*
@@ -2003,6 +2257,10 @@ entry:
; DARWIN-X64-LABEL: test17a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test17a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.vec, align 16
%y = getelementptr inbounds %struct.vec, %struct.vec* %c, i64 0, i32 0
%add.ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %y, i64 -12
@@ -2031,6 +2289,10 @@ entry:
; DARWIN-X64-LABEL: test17b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test17b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.vec, align 16
%y = getelementptr inbounds %struct.vec, %struct.vec* %c, i64 0, i32 0
%add.ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %y, i64 -12
@@ -2059,6 +2321,10 @@ entry:
; DARWIN-X64-LABEL: test17c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test17c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.vec, align 16
%y = getelementptr inbounds %struct.vec, %struct.vec* %c, i64 0, i32 0
%add.ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %y, i64 -12
@@ -2087,6 +2353,10 @@ entry:
; DARWIN-X64-LABEL: test17d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test17d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.vec, align 16
%y = getelementptr inbounds %struct.vec, %struct.vec* %c, i64 0, i32 0
%add.ptr = getelementptr inbounds <4 x i32>, <4 x i32>* %y, i64 -12
@@ -2114,6 +2384,10 @@ entry:
; DARWIN-X64-LABEL: test18a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test18a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
%exn.slot = alloca i8*
%ehselector.slot = alloca i32
@@ -2151,6 +2425,10 @@ entry:
; DARWIN-X64-LABEL: test18b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test18b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32, align 4
%exn.slot = alloca i8*
%ehselector.slot = alloca i32
@@ -2188,6 +2466,10 @@ entry:
; DARWIN-X64-LABEL: test18c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test18c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
%exn.slot = alloca i8*
%ehselector.slot = alloca i32
@@ -2225,6 +2507,10 @@ entry:
; DARWIN-X64-LABEL: test18d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test18d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32, align 4
%exn.slot = alloca i8*
%ehselector.slot = alloca i32
@@ -2261,6 +2547,10 @@ entry:
; DARWIN-X64-LABEL: test19a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test19a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.pair, align 4
%exn.slot = alloca i8*
%ehselector.slot = alloca i32
@@ -2301,6 +2591,10 @@ entry:
; DARWIN-X64-LABEL: test19b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test19b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.pair, align 4
%exn.slot = alloca i8*
%ehselector.slot = alloca i32
@@ -2341,6 +2635,10 @@ entry:
; DARWIN-X64-LABEL: test19c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test19c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.pair, align 4
%exn.slot = alloca i8*
%ehselector.slot = alloca i32
@@ -2385,6 +2683,10 @@ entry:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
; DARWIN-X64-NOT: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test19d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%c = alloca %struct.pair, align 4
%exn.slot = alloca i8*
%ehselector.slot = alloca i32
@@ -2423,6 +2725,10 @@ entry:
; DARWIN-X64-LABEL: test20a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test20a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32*, align 8
%b = alloca i32**, align 8
%call = call i32* @getp()
@@ -2454,6 +2760,10 @@ entry:
; DARWIN-X64-LABEL: test20b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test20b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32*, align 8
%b = alloca i32**, align 8
%call = call i32* @getp()
@@ -2485,6 +2795,10 @@ entry:
; DARWIN-X64-LABEL: test20c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test20c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32*, align 8
%b = alloca i32**, align 8
%call = call i32* @getp()
@@ -2516,6 +2830,10 @@ entry:
; DARWIN-X64-LABEL: test20d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test20d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32*, align 8
%b = alloca i32**, align 8
%call = call i32* @getp()
@@ -2546,6 +2864,10 @@ entry:
; DARWIN-X64-LABEL: test21a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test21a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32*, align 8
%b = alloca float**, align 8
%call = call i32* @getp()
@@ -2578,6 +2900,10 @@ entry:
; DARWIN-X64-LABEL: test21b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test21b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca i32*, align 8
%b = alloca float**, align 8
%call = call i32* @getp()
@@ -2610,6 +2936,10 @@ entry:
; DARWIN-X64-LABEL: test21c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test21c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32*, align 8
%b = alloca float**, align 8
%call = call i32* @getp()
@@ -2642,6 +2972,10 @@ entry:
; DARWIN-X64-LABEL: test21d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test21d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca i32*, align 8
%b = alloca float**, align 8
%call = call i32* @getp()
@@ -2673,6 +3007,10 @@ entry:
; DARWIN-X64-LABEL: test22a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test22a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca %class.A, align 1
%array = getelementptr inbounds %class.A, %class.A* %a, i32 0, i32 0
%arrayidx = getelementptr inbounds [2 x i8], [2 x i8]* %array, i32 0, i64 0
@@ -2701,6 +3039,10 @@ entry:
; DARWIN-X64-LABEL: test22b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test22b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca %class.A, align 1
%array = getelementptr inbounds %class.A, %class.A* %a, i32 0, i32 0
%arrayidx = getelementptr inbounds [2 x i8], [2 x i8]* %array, i32 0, i64 0
@@ -2729,6 +3071,10 @@ entry:
; DARWIN-X64-LABEL: test22c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test22c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca %class.A, align 1
%array = getelementptr inbounds %class.A, %class.A* %a, i32 0, i32 0
%arrayidx = getelementptr inbounds [2 x i8], [2 x i8]* %array, i32 0, i64 0
@@ -2757,6 +3103,10 @@ entry:
; DARWIN-X64-LABEL: test22d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test22d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca %class.A, align 1
%array = getelementptr inbounds %class.A, %class.A* %a, i32 0, i32 0
%arrayidx = getelementptr inbounds [2 x i8], [2 x i8]* %array, i32 0, i64 0
@@ -2784,6 +3134,10 @@ entry:
; DARWIN-X64-LABEL: test23a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test23a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%x = alloca %struct.deep, align 1
%b = getelementptr inbounds %struct.deep, %struct.deep* %x, i32 0, i32 0
%c = bitcast %union.anon* %b to %struct.anon*
@@ -2816,6 +3170,10 @@ entry:
; DARWIN-X64-LABEL: test23b:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test23b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%x = alloca %struct.deep, align 1
%b = getelementptr inbounds %struct.deep, %struct.deep* %x, i32 0, i32 0
%c = bitcast %union.anon* %b to %struct.anon*
@@ -2848,6 +3206,10 @@ entry:
; DARWIN-X64-LABEL: test23c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test23c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%x = alloca %struct.deep, align 1
%b = getelementptr inbounds %struct.deep, %struct.deep* %x, i32 0, i32 0
%c = bitcast %union.anon* %b to %struct.anon*
@@ -2880,6 +3242,10 @@ entry:
; DARWIN-X64-LABEL: test23d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test23d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%x = alloca %struct.deep, align 1
%b = getelementptr inbounds %struct.deep, %struct.deep* %x, i32 0, i32 0
%c = bitcast %union.anon* %b to %struct.anon*
@@ -2911,6 +3277,10 @@ entry:
; DARWIN-X64-LABEL: test24a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test24a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%n.addr = alloca i32, align 4
%a = alloca i32*, align 8
store i32 %n, i32* %n.addr, align 4
@@ -2943,6 +3313,10 @@ entry:
; DARWIN-X64-LABEL: test24b:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test24b:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%n.addr = alloca i32, align 4
%a = alloca i32*, align 8
store i32 %n, i32* %n.addr, align 4
@@ -2975,6 +3349,10 @@ entry:
; DARWIN-X64-LABEL: test24c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test24c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%n.addr = alloca i32, align 4
%a = alloca i32*, align 8
store i32 %n, i32* %n.addr, align 4
@@ -3007,6 +3385,10 @@ entry:
; DARWIN-X64-LABEL: test24d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test24d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%n.addr = alloca i32, align 4
%a = alloca i32*, align 8
store i32 %n, i32* %n.addr, align 4
@@ -3038,6 +3420,10 @@ entry:
; DARWIN-X64-LABEL: test25a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test25a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca [4 x i32], align 16
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32]* %a, i32 0, i64 0
%0 = load i32, i32* %arrayidx, align 4
@@ -3065,6 +3451,10 @@ entry:
; DARWIN-X64-LABEL: test25b:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test25b:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%a = alloca [4 x i32], align 16
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32]* %a, i32 0, i64 0
%0 = load i32, i32* %arrayidx, align 4
@@ -3092,6 +3482,10 @@ entry:
; DARWIN-X64-LABEL: test25c:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test25c:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca [4 x i32], align 16
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32]* %a, i32 0, i64 0
%0 = load i32, i32* %arrayidx, align 4
@@ -3119,6 +3513,10 @@ entry:
; DARWIN-X64-LABEL: test25d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test25d:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%a = alloca [4 x i32], align 16
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32]* %a, i32 0, i64 0
%0 = load i32, i32* %arrayidx, align 4
@@ -3148,6 +3546,10 @@ entry:
; DARWIN-X64-LABEL: test26:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test26:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%c = alloca %struct.nest, align 4
%b = getelementptr inbounds %struct.nest, %struct.nest* %c, i32 0, i32 1
%_a = getelementptr inbounds %struct.pair, %struct.pair* %b, i32 0, i32 0
@@ -3180,6 +3582,10 @@ bb:
; DARWIN-X64-LABEL: test27:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test27:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%tmp = alloca %struct.small*, align 8
%tmp1 = call i32 (...) @dummy(%struct.small** %tmp)
%tmp2 = load %struct.small*, %struct.small** %tmp, align 8
@@ -3233,6 +3639,10 @@ entry:
; DARWIN-X64-LABEL: test28a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test28a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%test = alloca [32 x i8], align 16
%arraydecay = getelementptr inbounds [32 x i8], [32 x i8]* %test, i32 0, i32 0
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
@@ -3259,6 +3669,10 @@ entry:
; DARWIN-X64-LABEL: test28b:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test28b:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%test = alloca [33 x i8], align 16
%arraydecay = getelementptr inbounds [33 x i8], [33 x i8]* %test, i32 0, i32 0
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
@@ -3285,6 +3699,10 @@ entry:
; DARWIN-X64-LABEL: test29a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test29a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%test = alloca [4 x i8], align 1
%arraydecay = getelementptr inbounds [4 x i8], [4 x i8]* %test, i32 0, i32 0
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
@@ -3311,6 +3729,10 @@ entry:
; DARWIN-X64-LABEL: test29b:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test29b:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%test = alloca [5 x i8], align 1
%arraydecay = getelementptr inbounds [5 x i8], [5 x i8]* %test, i32 0, i32 0
%call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
@@ -3338,6 +3760,10 @@ entry:
; DARWIN-X64-LABEL: test30a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test30a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%test = alloca %struct.small_char, align 4
%test.coerce = alloca { i64, i8 }
%0 = bitcast { i64, i8 }* %test.coerce to i8*
@@ -3372,6 +3798,10 @@ entry:
; DARWIN-X64-LABEL: test30b:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test30b:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%test = alloca %struct.small_char, align 4
%test.coerce = alloca { i64, i8 }
%0 = bitcast { i64, i8 }* %test.coerce to i8*
@@ -3406,6 +3836,10 @@ entry:
; DARWIN-X64-LABEL: test31a:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
+
+; MSVC-I386-LABEL: test31a:
+; MSVC-I386-NOT: calll @__security_check_cookie@4
+; MSVC-I386: retl
%test = alloca i8*, align 8
%0 = alloca i8, i64 4
store i8* %0, i8** %test, align 8
@@ -3434,6 +3868,10 @@ entry:
; DARWIN-X64-LABEL: test31b:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+
+; MSVC-I386-LABEL: test31b:
+; MSVC-I386: movl ___security_cookie,
+; MSVC-I386: calll @__security_check_cookie@4
%test = alloca i8*, align 8
%0 = alloca i8, i64 5
store i8* %0, i8** %test, align 8
@@ -3442,6 +3880,17 @@ entry:
ret i32 %call
}
+define void @__stack_chk_fail() #1 !dbg !6 {
+entry:
+ ret void
+}
+
+define void @test32() #1 !dbg !7 {
+entry:
+ %0 = alloca [5 x i8], align 1
+ ret void
+}
+
declare double @testi_aux()
declare i8* @strcpy(i8*, i8*)
declare i32 @printf(i8*, ...)
@@ -3461,3 +3910,16 @@ attributes #2 = { sspreq }
attributes #3 = { ssp "stack-protector-buffer-size"="33" }
attributes #4 = { ssp "stack-protector-buffer-size"="5" }
attributes #5 = { ssp "stack-protector-buffer-size"="6" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1)
+!1 = !DIFile(filename: "test.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!"clang version x.y.z"}
+!6 = distinct !DISubprogram(name: "__stack_chk_fail", scope: !1, unit: !0)
+!7 = distinct !DISubprogram(name: "foo", scope: !1, unit: !0)
diff --git a/test/CodeGen/X86/stack_guard_remat.ll b/test/CodeGen/X86/stack_guard_remat.ll
index 90ac2cc601fa..d38c68a8a5bb 100644
--- a/test/CodeGen/X86/stack_guard_remat.ll
+++ b/test/CodeGen/X86/stack_guard_remat.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -no-integrated-as | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -no-integrated-as | FileCheck %s
;CHECK: foo2
;CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), [[R0:%[a-z0-9]+]]
diff --git a/test/CodeGen/X86/stackguard-internal.ll b/test/CodeGen/X86/stackguard-internal.ll
new file mode 100644
index 000000000000..328e04b9a718
--- /dev/null
+++ b/test/CodeGen/X86/stackguard-internal.ll
@@ -0,0 +1,15 @@
+; Check that the backend doesn't crash.
+; RUN: llc -mtriple=x86_64-pc-freebsd %s -o - | FileCheck %s
+
+@__stack_chk_guard = internal global [8 x i64] zeroinitializer, align 16
+
+define void @f() sspstrong {
+ %tbl = alloca [4 x i64], align 16
+ ret void
+}
+
+; CHECK: movq __stack_chk_guard(%rip), %rax
+; CHECK: movq __stack_chk_guard(%rip), %rax
+; CHECK: .type __stack_chk_guard,@object
+; CHECK: .local __stack_chk_guard
+; CHECK: .comm __stack_chk_guard,64,16
diff --git a/test/CodeGen/X86/stackmap-frame-setup.ll b/test/CodeGen/X86/stackmap-frame-setup.ll
index 076e2482f8ba..b83a8d61f6a2 100644
--- a/test/CodeGen/X86/stackmap-frame-setup.ll
+++ b/test/CodeGen/X86/stackmap-frame-setup.ll
@@ -1,5 +1,5 @@
-; RUN: llc -o /dev/null -verify-machineinstrs -mtriple=x86_64-apple-darwin -mcpu=corei7 -stop-after machine-sink %s | FileCheck %s --check-prefix=ISEL
-; RUN: llc -o /dev/null -verify-machineinstrs -mtriple=x86_64-apple-darwin -mcpu=corei7 -fast-isel -fast-isel-abort=1 -stop-after machine-sink %s | FileCheck %s --check-prefix=FAST-ISEL
+; RUN: llc -o - -verify-machineinstrs -mtriple=x86_64-apple-darwin -mcpu=corei7 -stop-after machine-sink %s | FileCheck %s --check-prefix=ISEL
+; RUN: llc -o - -verify-machineinstrs -mtriple=x86_64-apple-darwin -mcpu=corei7 -fast-isel -fast-isel-abort=1 -stop-after machine-sink %s | FileCheck %s --check-prefix=FAST-ISEL
define void @caller_meta_leaf() {
entry:
diff --git a/test/CodeGen/X86/stackmap-large-constants.ll b/test/CodeGen/X86/stackmap-large-constants.ll
index a38b9209a1cf..0143a4e0fbc8 100644
--- a/test/CodeGen/X86/stackmap-large-constants.ll
+++ b/test/CodeGen/X86/stackmap-large-constants.ll
@@ -46,7 +46,7 @@
; NumLiveOuts
; CHECK-NEXT: .short 0
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
declare void @llvm.experimental.stackmap(i64, i32, ...)
diff --git a/test/CodeGen/X86/stackmap-liveness.ll b/test/CodeGen/X86/stackmap-liveness.ll
index 599b6265abfa..d2dd263a6174 100644
--- a/test/CodeGen/X86/stackmap-liveness.ll
+++ b/test/CodeGen/X86/stackmap-liveness.ll
@@ -34,7 +34,7 @@ entry:
; Num LiveOut Entries: 0
; CHECK-NEXT: .short 0
; Align
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; StackMap 1 (patchpoint liveness information enabled)
; PATCH-LABEL: .long L{{.*}}-_stackmap_liveness
@@ -49,7 +49,7 @@ entry:
; PATCH-NEXT: .byte 0
; PATCH-NEXT: .byte 16
; Align
-; PATCH-NEXT: .align 3
+; PATCH-NEXT: .p2align 3
call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 1, i32 12, i8* null, i32 0)
%a2 = call i64 asm sideeffect "", "={r8}"() nounwind
%a3 = call i8 asm sideeffect "", "={ah}"() nounwind
@@ -65,7 +65,7 @@ entry:
; Num LiveOut Entries: 0
; CHECK-NEXT: .short 0
; Align
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; StackMap 2 (patchpoint liveness information enabled)
; PATCH-LABEL: .long L{{.*}}-_stackmap_liveness
@@ -96,7 +96,7 @@ entry:
; PATCH-NEXT: .byte 0
; PATCH-NEXT: .byte 16
; Align
-; PATCH-NEXT: .align 3
+; PATCH-NEXT: .p2align 3
call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 2, i32 12, i8* null, i32 0)
call void asm sideeffect "", "{r8},{ah},{ymm0},{ymm1}"(i64 %a2, i8 %a3, <4 x double> %a4, <4 x double> %a5) nounwind
@@ -109,7 +109,7 @@ entry:
; Num LiveOut Entries: 0
; CHECK-NEXT: .short 0
; Align
-; CHECK-NEXT: .align 3
+; CHECK-NEXT: .p2align 3
; StackMap 3 (patchpoint liveness information enabled)
; PATCH-LABEL: .long L{{.*}}-_stackmap_liveness
@@ -128,7 +128,7 @@ entry:
; PATCH-NEXT: .byte 0
; PATCH-NEXT: .byte 16
; Align
-; PATCH-NEXT: .align 3
+; PATCH-NEXT: .p2align 3
call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 3, i32 12, i8* null, i32 0)
call void asm sideeffect "", "{xmm2}"(<2 x double> %a1) nounwind
ret void
@@ -146,7 +146,7 @@ entry:
; Num LiveOut Entries: 0
; PATCH-NEXT: .short 0
; Align
-; PATCH-NEXT: .align 3
+; PATCH-NEXT: .p2align 3
; StackMap 5 (patchpoint liveness information enabled)
; PATCH-LABEL: .long L{{.*}}-_mixed_liveness
@@ -165,7 +165,7 @@ entry:
; PATCH-NEXT: .byte 0
; PATCH-NEXT: .byte 16
; Align
-; PATCH-NEXT: .align 3
+; PATCH-NEXT: .p2align 3
call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 5)
call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 5, i32 0, i8* null, i32 0)
call void asm sideeffect "", "{xmm2}"(<2 x double> %a1) nounwind
diff --git a/test/CodeGen/X86/statepoint-allocas.ll b/test/CodeGen/X86/statepoint-allocas.ll
index fa2621e7d2fe..040ab614d0a8 100644
--- a/test/CodeGen/X86/statepoint-allocas.ll
+++ b/test/CodeGen/X86/statepoint-allocas.ll
@@ -96,7 +96,7 @@ declare token @llvm.experimental.gc.statepoint.p0f_i1f(i64, i32, i1 ()*, i32, i3
; No Padding or LiveOuts
; CHECK: .short 0
; CHECK: .short 0
-; CHECK: .align 8
+; CHECK: .p2align 3
; The Deopt one
; CHECK: .long .Ltmp3-test2
@@ -126,5 +126,5 @@ declare token @llvm.experimental.gc.statepoint.p0f_i1f(i64, i32, i1 ()*, i32, i3
; No Padding or LiveOuts
; CHECK: .short 0
; CHECK: .short 0
-; CHECK: .align 8
+; CHECK: .p2align 3
diff --git a/test/CodeGen/X86/statepoint-invoke.ll b/test/CodeGen/X86/statepoint-invoke.ll
index 1d38b2facc73..3e8b8ca49f1d 100644
--- a/test/CodeGen/X86/statepoint-invoke.ll
+++ b/test/CodeGen/X86/statepoint-invoke.ll
@@ -41,7 +41,7 @@ exceptional_return:
; CHECK: .long .Ltmp{{[0-9]+}}-.Ltmp{{[0-9]+}}
; CHECK: .long .Ltmp{{[0-9]+}}-.Lfunc_begin{{[0-9]+}}
; CHECK: .byte 0
-; CHECK: .align 4
+; CHECK: .p2align 4
define i64 addrspace(1)* @test_result(i64 addrspace(1)* %obj,
i64 addrspace(1)* %obj1)
@@ -71,7 +71,7 @@ exceptional_return:
; CHECK: .long .Ltmp{{[0-9]+}}-.Ltmp{{[0-9]+}}
; CHECK: .long .Ltmp{{[0-9]+}}-.Lfunc_begin{{[0-9]+}}
; CHECK: .byte 0
-; CHECK: .align 4
+; CHECK: .p2align 4
define i64 addrspace(1)* @test_same_val(i1 %cond, i64 addrspace(1)* %val1, i64 addrspace(1)* %val2, i64 addrspace(1)* %val3)
gc "statepoint-example" personality i32 ()* @"personality_function" {
diff --git a/test/CodeGen/X86/statepoint-stack-usage.ll b/test/CodeGen/X86/statepoint-stack-usage.ll
index d4784212810f..d4bc7d47f669 100644
--- a/test/CodeGen/X86/statepoint-stack-usage.ll
+++ b/test/CodeGen/X86/statepoint-stack-usage.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -stack-symbol-ordering=0 < %s | FileCheck %s
target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
diff --git a/test/CodeGen/X86/statepoint-stackmap-format.ll b/test/CodeGen/X86/statepoint-stackmap-format.ll
index 4f8b2ce6efd9..2b1357a1179a 100644
--- a/test/CodeGen/X86/statepoint-stackmap-format.ll
+++ b/test/CodeGen/X86/statepoint-stackmap-format.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple="x86_64-pc-linux-gnu" | FileCheck %s
-; RUN: llc < %s -mtriple="x86_64-pc-unknown-elf" | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -mtriple="x86_64-pc-linux-gnu" | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -mtriple="x86_64-pc-unknown-elf" | FileCheck %s
; This test is a sanity check to ensure statepoints are generating StackMap
; sections correctly. This is not intended to be a rigorous test of the
@@ -168,7 +168,7 @@ declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32)
; No Padding or LiveOuts
; CHECK: .short 0
; CHECK: .short 0
-; CHECK: .align 8
+; CHECK: .p2align 3
;
; test_derived_arg
@@ -235,7 +235,7 @@ declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32)
; No Padding or LiveOuts
; CHECK: .short 0
; CHECK: .short 0
-; CHECK: .align 8
+; CHECK: .p2align 3
; Records for the test_id function:
@@ -275,5 +275,5 @@ declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32)
; No padding or LiveOuts
; CHECK: .short 0
; CHECK: .short 0
-; CHECK: .align 8
+; CHECK: .p2align 3
diff --git a/test/CodeGen/X86/statepoint-uniqueing.ll b/test/CodeGen/X86/statepoint-uniqueing.ll
new file mode 100644
index 000000000000..e791bc6b2333
--- /dev/null
+++ b/test/CodeGen/X86/statepoint-uniqueing.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s | FileCheck %s
+
+; Checks for a crash we had when two gc.relocate calls would
+; relocate identical values
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+declare void @f()
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32) #3
+declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32, i32) #3
+
+define void @test_gcrelocate_uniqueing(i32 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: test_gcrelocate_uniqueing
+ %tok = tail call token (i64, i32, void ()*, i32, i32, ...)
+ @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 2, i32 addrspace(1)* %ptr, i32 undef, i32 addrspace(1)* %ptr, i32 addrspace(1)* %ptr)
+ %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %tok, i32 9, i32 9)
+ %b = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %tok, i32 10, i32 10)
+ ret void
+}
+
+define void @test_gcptr_uniqueing(i32 addrspace(1)* %ptr) gc "statepoint-example" {
+; CHECK-LABEL: test_gcptr_uniqueing
+ %ptr2 = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
+ %tok = tail call token (i64, i32, void ()*, i32, i32, ...)
+ @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @f, i32 0, i32 0, i32 0, i32 2, i32 addrspace(1)* %ptr, i32 undef, i32 addrspace(1)* %ptr, i8 addrspace(1)* %ptr2)
+ %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %tok, i32 9, i32 9)
+ %b = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %tok, i32 10, i32 10)
+ ret void
+}
diff --git a/test/CodeGen/X86/statepoint-vector-bad-spill.ll b/test/CodeGen/X86/statepoint-vector-bad-spill.ll
new file mode 100644
index 000000000000..848988589cb0
--- /dev/null
+++ b/test/CodeGen/X86/statepoint-vector-bad-spill.ll
@@ -0,0 +1,39 @@
+; RUN: llc -O3 < %s | FileCheck %s
+
+; This is checking for a crash.
+
+target triple = "x86_64-pc-linux-gnu"
+
+define <2 x i8 addrspace(1)*> @test0(i8 addrspace(1)* %el, <2 x i8 addrspace(1)*>* %vec_ptr) gc "statepoint-example" {
+; CHECK-LABEL: test0:
+
+entry:
+ %tok0 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %el)
+ %el.relocated = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %tok0, i32 7, i32 7)
+
+ %obj.pre = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*>* %vec_ptr
+ %obj = insertelement <2 x i8 addrspace(1)*> %obj.pre, i8 addrspace(1)* %el.relocated, i32 0 ; No real objective here, except to use %el
+
+ %tok1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0, <2 x i8 addrspace(1)*> %obj)
+ %obj.relocated = call <2 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v2p1i8(token %tok1, i32 7, i32 7)
+ ret <2 x i8 addrspace(1)*> %obj.relocated
+}
+
+define i8 addrspace(1)* @test1(<2 x i8 addrspace(1)*> %obj) gc "statepoint-example" {
+; CHECK-LABEL: test1:
+
+entry:
+ %tok1 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0, <2 x i8 addrspace(1)*> %obj)
+ %obj.relocated = call <2 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v2p1i8(token %tok1, i32 7, i32 7)
+
+ %el = extractelement <2 x i8 addrspace(1)*> %obj.relocated, i32 0
+ %tok0 = call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @do_safepoint, i32 0, i32 0, i32 0, i32 0, i8 addrspace(1)* %el)
+ %el.relocated = call i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token %tok0, i32 7, i32 7)
+ ret i8 addrspace(1)* %el.relocated
+}
+
+declare void @do_safepoint()
+
+declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32, i32)
+declare <2 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v2p1i8(token, i32, i32)
diff --git a/test/CodeGen/X86/statepoint-vector.ll b/test/CodeGen/X86/statepoint-vector.ll
index 9d80e9217b49..21e7b204a070 100644
--- a/test/CodeGen/X86/statepoint-vector.ll
+++ b/test/CodeGen/X86/statepoint-vector.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=core-avx -debug-only=stackmaps < %s | FileCheck %s
+; RUN: llc -stack-symbol-ordering=0 -mcpu=nehalem -debug-only=stackmaps < %s | FileCheck %s
; REQUIRES: asserts
target triple = "x86_64-pc-linux-gnu"
diff --git a/test/CodeGen/X86/stdarg.ll b/test/CodeGen/X86/stdarg.ll
index 42cbcb1008d3..7b4f4e845fce 100644
--- a/test/CodeGen/X86/stdarg.ll
+++ b/test/CodeGen/X86/stdarg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc -stack-symbol-ordering=0 < %s -mtriple=x86_64-linux | FileCheck %s
%struct.__va_list_tag = type { i32, i32, i8*, i8* }
diff --git a/test/CodeGen/X86/store-narrow.ll b/test/CodeGen/X86/store-narrow.ll
index 6c1c56e43a4c..16f152d169d3 100644
--- a/test/CodeGen/X86/store-narrow.ll
+++ b/test/CodeGen/X86/store-narrow.ll
@@ -1,6 +1,7 @@
; rdar://7860110
; RUN: llc -asm-verbose=false < %s | FileCheck %s -check-prefix=X64
-; RUN: llc -march=x86 -asm-verbose=false < %s | FileCheck %s -check-prefix=X32
+; RUN: llc -march=x86 -asm-verbose=false -fixup-byte-word-insts=1 < %s | FileCheck %s -check-prefix=X32 -check-prefix=X32-BWON
+; RUN: llc -march=x86 -asm-verbose=false -fixup-byte-word-insts=0 < %s | FileCheck %s -check-prefix=X32 -check-prefix=X32-BWOFF
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.2"
@@ -50,7 +51,8 @@ entry:
; X64: movw %si, (%rdi)
; X32-LABEL: test3:
-; X32: movw 8(%esp), %ax
+; X32-BWON: movzwl 8(%esp), %eax
+; X32-BWOFF: movw 8(%esp), %ax
; X32: movw %ax, (%{{.*}})
}
@@ -67,7 +69,8 @@ entry:
; X64: movw %si, 2(%rdi)
; X32-LABEL: test4:
-; X32: movw 8(%esp), %[[REG:[abcd]]]x
+; X32-BWON: movzwl 8(%esp), %e[[REG:[abcd]]]x
+; X32-BWOFF: movw 8(%esp), %[[REG:[abcd]]]x
; X32: movw %[[REG]]x, 2(%{{.*}})
}
@@ -84,7 +87,8 @@ entry:
; X64: movw %si, 2(%rdi)
; X32-LABEL: test5:
-; X32: movw 8(%esp), %[[REG:[abcd]]]x
+; X32-BWON: movzwl 8(%esp), %e[[REG:[abcd]]]x
+; X32-BWOFF: movw 8(%esp), %[[REG:[abcd]]]x
; X32: movw %[[REG]]x, 2(%{{.*}})
}
diff --git a/test/CodeGen/X86/store-zero-and-minus-one.ll b/test/CodeGen/X86/store-zero-and-minus-one.ll
new file mode 100644
index 000000000000..14790018e050
--- /dev/null
+++ b/test/CodeGen/X86/store-zero-and-minus-one.ll
@@ -0,0 +1,88 @@
+; RUN: llc -mtriple=i686-unknown-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK32 --check-prefix=CHECK
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK64 --check-prefix=CHECK
+
+define void @zero_optsize(i32* %p) optsize {
+entry:
+ store i32 0, i32* %p
+ ret void
+
+; CHECK-LABEL: zero_optsize:
+; CHECK: movl $0
+; CHECK: ret
+}
+
+define void @minus_one_optsize(i32* %p) optsize {
+entry:
+ store i32 -1, i32* %p
+ ret void
+
+; CHECK-LABEL: minus_one_optsize:
+; CHECK: movl $-1
+; CHECK: ret
+}
+
+
+define void @zero_64(i64* %p) minsize {
+entry:
+ store i64 0, i64* %p
+ ret void
+
+; CHECK-LABEL: zero_64:
+; CHECK32: andl $0
+; CHECK32: andl $0
+; CHECK64: andq $0
+; CHECK: ret
+}
+
+define void @zero_32(i32* %p) minsize {
+entry:
+ store i32 0, i32* %p
+ ret void
+
+; CHECK-LABEL: zero_32:
+; CHECK: andl $0
+; CHECK: ret
+}
+
+define void @zero_16(i16* %p) minsize {
+entry:
+ store i16 0, i16* %p
+ ret void
+
+; CHECK-LABEL: zero_16:
+; CHECK: andw $0
+; CHECK: ret
+}
+
+
+define void @minus_one_64(i64* %p) minsize {
+entry:
+ store i64 -1, i64* %p
+ ret void
+
+; CHECK-LABEL: minus_one_64:
+; CHECK32: orl $-1
+; CHECK32: orl $-1
+; CHECK64: orq $-1
+; CHECK: ret
+}
+
+define void @minus_one_32(i32* %p) minsize {
+entry:
+ store i32 -1, i32* %p
+ ret void
+
+; CHECK-LABEL: minus_one_32:
+; CHECK: orl $-1
+; CHECK: ret
+}
+
+define void @minus_one_16(i16* %p) minsize {
+entry:
+ store i16 -1, i16* %p
+ ret void
+
+; CHECK-LABEL: minus_one_16:
+; CHECK: orw $-1
+; CHECK: ret
+}
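A plausible reason the minsize functions above expect and/or stores while the optsize ones keep plain movl: an and/or with a sign-extended 8-bit immediate encodes in fewer bytes than a mov with a full 32-bit immediate. A rough comparison, assuming the usual 32-bit encodings (the byte counts below are illustrative, not taken from the test):

    movl $0, (%eax)     ->  C7 00 00 00 00 00   (6 bytes)
    andl $0, (%eax)     ->  83 20 00            (3 bytes)
    orl  $-1, (%eax)    ->  83 08 FF            (3 bytes)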
diff --git a/test/CodeGen/X86/swift-return.ll b/test/CodeGen/X86/swift-return.ll
new file mode 100644
index 000000000000..cd028d0c16ad
--- /dev/null
+++ b/test/CodeGen/X86/swift-return.ll
@@ -0,0 +1,206 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-unknown-unknown -O0 | FileCheck --check-prefix=CHECK-O0 %s
+
+@var = global i32 0
+
+; Test how LLVM handles a return type of {i16, i8}. The return value will be
+; passed in %eax and %dl.
+; CHECK-LABEL: test:
+; CHECK: movl %edi
+; CHECK: callq gen
+; CHECK: movsbl %dl
+; CHECK: addl %{{.*}}, %eax
+; CHECK-O0-LABEL: test
+; CHECK-O0: movl %edi
+; CHECK-O0: callq gen
+; CHECK-O0: movswl %ax
+; CHECK-O0: movsbl %dl
+; CHECK-O0: addl
+; CHECK-O0: movw %{{.*}}, %ax
+define i16 @test(i32 %key) {
+entry:
+ %key.addr = alloca i32, align 4
+ store i32 %key, i32* %key.addr, align 4
+ %0 = load i32, i32* %key.addr, align 4
+ %call = call swiftcc { i16, i8 } @gen(i32 %0)
+ %v3 = extractvalue { i16, i8 } %call, 0
+ %v1 = sext i16 %v3 to i32
+ %v5 = extractvalue { i16, i8 } %call, 1
+ %v2 = sext i8 %v5 to i32
+ %add = add nsw i32 %v1, %v2
+ %conv = trunc i32 %add to i16
+ ret i16 %conv
+}
+
+declare swiftcc { i16, i8 } @gen(i32)
+
+; If we can't pass every return value in registers, we will pass everything
+; in memory. The caller provides space for the return value and passes
+; the address in %rax. The first input argument will be in %rdi.
+; CHECK-LABEL: test2:
+; CHECK: leaq (%rsp), %rax
+; CHECK: callq gen2
+; CHECK: movl (%rsp)
+; CHECK-DAG: addl 4(%rsp)
+; CHECK-DAG: addl 8(%rsp)
+; CHECK-DAG: addl 12(%rsp)
+; CHECK-DAG: addl 16(%rsp)
+; CHECK-O0-LABEL: test2:
+; CHECK-O0-DAG: leaq (%rsp), %rax
+; CHECK-O0: callq gen2
+; CHECK-O0-DAG: movl (%rsp)
+; CHECK-O0-DAG: movl 4(%rsp)
+; CHECK-O0-DAG: movl 8(%rsp)
+; CHECK-O0-DAG: movl 12(%rsp)
+; CHECK-O0-DAG: movl 16(%rsp)
+; CHECK-O0: addl
+; CHECK-O0: addl
+; CHECK-O0: addl
+; CHECK-O0: addl
+; CHECK-O0: movl %{{.*}}, %eax
+define i32 @test2(i32 %key) #0 {
+entry:
+ %key.addr = alloca i32, align 4
+ store i32 %key, i32* %key.addr, align 4
+ %0 = load i32, i32* %key.addr, align 4
+ %call = call swiftcc { i32, i32, i32, i32, i32 } @gen2(i32 %0)
+
+ %v3 = extractvalue { i32, i32, i32, i32, i32 } %call, 0
+ %v5 = extractvalue { i32, i32, i32, i32, i32 } %call, 1
+ %v6 = extractvalue { i32, i32, i32, i32, i32 } %call, 2
+ %v7 = extractvalue { i32, i32, i32, i32, i32 } %call, 3
+ %v8 = extractvalue { i32, i32, i32, i32, i32 } %call, 4
+
+ %add = add nsw i32 %v3, %v5
+ %add1 = add nsw i32 %add, %v6
+ %add2 = add nsw i32 %add1, %v7
+ %add3 = add nsw i32 %add2, %v8
+ ret i32 %add3
+}
+
+; The address of the return value is passed in %rax.
+; On return, we don't keep the address in %rax.
+; CHECK-LABEL: gen2:
+; CHECK: movl %edi, 16(%rax)
+; CHECK: movl %edi, 12(%rax)
+; CHECK: movl %edi, 8(%rax)
+; CHECK: movl %edi, 4(%rax)
+; CHECK: movl %edi, (%rax)
+; CHECK-O0-LABEL: gen2:
+; CHECK-O0-DAG: movl %edi, 16(%rax)
+; CHECK-O0-DAG: movl %edi, 12(%rax)
+; CHECK-O0-DAG: movl %edi, 8(%rax)
+; CHECK-O0-DAG: movl %edi, 4(%rax)
+; CHECK-O0-DAG: movl %edi, (%rax)
+define swiftcc { i32, i32, i32, i32, i32 } @gen2(i32 %key) {
+ %Y = insertvalue { i32, i32, i32, i32, i32 } undef, i32 %key, 0
+ %Z = insertvalue { i32, i32, i32, i32, i32 } %Y, i32 %key, 1
+ %Z2 = insertvalue { i32, i32, i32, i32, i32 } %Z, i32 %key, 2
+ %Z3 = insertvalue { i32, i32, i32, i32, i32 } %Z2, i32 %key, 3
+ %Z4 = insertvalue { i32, i32, i32, i32, i32 } %Z3, i32 %key, 4
+ ret { i32, i32, i32, i32, i32 } %Z4
+}
+
+; The return value {i32, i32, i32, i32} will be returned via registers %eax,
+; %edx, %ecx, %r8d.
+; CHECK-LABEL: test3:
+; CHECK: callq gen3
+; CHECK: addl %edx, %eax
+; CHECK: addl %ecx, %eax
+; CHECK: addl %r8d, %eax
+; CHECK-O0-LABEL: test3:
+; CHECK-O0: callq gen3
+; CHECK-O0: addl %edx, %eax
+; CHECK-O0: addl %ecx, %eax
+; CHECK-O0: addl %r8d, %eax
+define i32 @test3(i32 %key) #0 {
+entry:
+ %key.addr = alloca i32, align 4
+ store i32 %key, i32* %key.addr, align 4
+ %0 = load i32, i32* %key.addr, align 4
+ %call = call swiftcc { i32, i32, i32, i32 } @gen3(i32 %0)
+
+ %v3 = extractvalue { i32, i32, i32, i32 } %call, 0
+ %v5 = extractvalue { i32, i32, i32, i32 } %call, 1
+ %v6 = extractvalue { i32, i32, i32, i32 } %call, 2
+ %v7 = extractvalue { i32, i32, i32, i32 } %call, 3
+
+ %add = add nsw i32 %v3, %v5
+ %add1 = add nsw i32 %add, %v6
+ %add2 = add nsw i32 %add1, %v7
+ ret i32 %add2
+}
+
+declare swiftcc { i32, i32, i32, i32 } @gen3(i32 %key)
+
+; The return value {float, float, float, float} will be returned via registers
+; %xmm0, %xmm1, %xmm2, %xmm3.
+; CHECK-LABEL: test4:
+; CHECK: callq gen4
+; CHECK: addss %xmm1, %xmm0
+; CHECK: addss %xmm2, %xmm0
+; CHECK: addss %xmm3, %xmm0
+; CHECK-O0-LABEL: test4:
+; CHECK-O0: callq gen4
+; CHECK-O0: addss %xmm1, %xmm0
+; CHECK-O0: addss %xmm2, %xmm0
+; CHECK-O0: addss %xmm3, %xmm0
+define float @test4(float %key) #0 {
+entry:
+ %key.addr = alloca float, align 4
+ store float %key, float* %key.addr, align 4
+ %0 = load float, float* %key.addr, align 4
+ %call = call swiftcc { float, float, float, float } @gen4(float %0)
+
+ %v3 = extractvalue { float, float, float, float } %call, 0
+ %v5 = extractvalue { float, float, float, float } %call, 1
+ %v6 = extractvalue { float, float, float, float } %call, 2
+ %v7 = extractvalue { float, float, float, float } %call, 3
+
+ %add = fadd float %v3, %v5
+ %add1 = fadd float %add, %v6
+ %add2 = fadd float %add1, %v7
+ ret float %add2
+}
+
+declare swiftcc { float, float, float, float } @gen4(float %key)
+
+; CHECK-LABEL: consume_i1_ret:
+; CHECK: callq produce_i1_ret
+; CHECK: andb $1, %al
+; CHECK: andb $1, %dl
+; CHECK: andb $1, %cl
+; CHECK: andb $1, %r8b
+; CHECK-O0-LABEL: consume_i1_ret:
+; CHECK-O0: callq produce_i1_ret
+; CHECK-O0: andb $1, %al
+; CHECK-O0: andb $1, %dl
+; CHECK-O0: andb $1, %cl
+; CHECK-O0: andb $1, %r8b
+define void @consume_i1_ret() {
+ %call = call swiftcc { i1, i1, i1, i1 } @produce_i1_ret()
+ %v3 = extractvalue { i1, i1, i1, i1 } %call, 0
+ %v5 = extractvalue { i1, i1, i1, i1 } %call, 1
+ %v6 = extractvalue { i1, i1, i1, i1 } %call, 2
+ %v7 = extractvalue { i1, i1, i1, i1 } %call, 3
+ %val = zext i1 %v3 to i32
+ store i32 %val, i32* @var
+ %val2 = zext i1 %v5 to i32
+ store i32 %val2, i32* @var
+ %val3 = zext i1 %v6 to i32
+ store i32 %val3, i32* @var
+ %val4 = zext i1 %v7 to i32
+ store i32 %val4, i32* @var
+ ret void
+}
+
+declare swiftcc { i1, i1, i1, i1 } @produce_i1_ret()
+
+; CHECK-LABEL: foo:
+; CHECK: movq %rdi, (%rax)
+; CHECK-O0-LABEL: foo:
+; CHECK-O0: movq %rdi, (%rax)
+define swiftcc void @foo(i64* sret %agg.result, i64 %val) {
+ store i64 %val, i64* %agg.result
+ ret void
+}
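One detail worth drawing out of the test2/gen2 checks above: when the five i32 results do not fit in registers, the caller reserves space on its own stack, passes that address to the callee in %rax, and reads the results back after the call; the callee stores through %rax rather than returning the pointer. A condensed reading of the checks (an illustration of what the test expects, not an ABI reference):

    leaq (%rsp), %rax       # caller: result buffer on its own stack, address in %rax
    callq gen2              # callee stores via movl %edi, (%rax) ... movl %edi, 16(%rax)
    movl (%rsp), ...        # caller reads the five results back from its stack
    addl 4(%rsp), ...

The "we don't keep the address in %rax" comment before gen2 is presumably the contrast with an ordinary sret return, where the callee would hand the buffer address back in %rax.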
diff --git a/test/CodeGen/X86/swifterror.ll b/test/CodeGen/X86/swifterror.ll
new file mode 100644
index 000000000000..d8db36b09c25
--- /dev/null
+++ b/test/CodeGen/X86/swifterror.ll
@@ -0,0 +1,359 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=CHECK-APPLE %s
+; RUN: llc -verify-machineinstrs -O0 < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=CHECK-O0 %s
+
+declare i8* @malloc(i64)
+declare void @free(i8*)
+%swift_error = type {i64, i8}
+
+; This tests the basic usage of a swifterror parameter. "foo" is the function
+; that takes a swifterror parameter and "caller" is the caller of "foo".
+define float @foo(%swift_error** swifterror %error_ptr_ref) {
+; CHECK-APPLE-LABEL: foo:
+; CHECK-APPLE: movl $16, %edi
+; CHECK-APPLE: malloc
+; CHECK-APPLE: movb $1, 8(%rax)
+; CHECK-APPLE: movq %rax, %r12
+
+; CHECK-O0-LABEL: foo:
+; CHECK-O0: movl $16
+; CHECK-O0: malloc
+; CHECK-O0: movb $1, 8(%rax)
+; CHECK-O0: movq %{{.*}}, %r12
+entry:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ ret float 1.0
+}
+
+; "caller" calls "foo" that takes a swifterror parameter.
+define float @caller(i8* %error_ref) {
+; CHECK-APPLE-LABEL: caller:
+; CHECK-APPLE: xorl %r12d, %r12d
+; CHECK-APPLE: callq {{.*}}foo
+; CHECK-APPLE: testq %r12, %r12
+; CHECK-APPLE: jne
+; Access part of the error object and save it to error_ref
+; CHECK-APPLE: movb 8(%r12)
+; CHECK-APPLE: movq %r12, %rdi
+; CHECK-APPLE: callq {{.*}}free
+
+; CHECK-O0-LABEL: caller:
+; CHECK-O0: xorl
+; CHECK-O0: movl %{{.*}}, %r12d
+; CHECK-O0: callq {{.*}}foo
+; CHECK-O0: jne
+entry:
+ %error_ptr_ref = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr_ref
+ %call = call float @foo(%swift_error** swifterror %error_ptr_ref)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+cont:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+ ret float 1.0
+}
+
+; "caller2" is the caller of "foo", it calls "foo" inside a loop.
+define float @caller2(i8* %error_ref) {
+; CHECK-APPLE-LABEL: caller2:
+; CHECK-APPLE: xorl %r12d, %r12d
+; CHECK-APPLE: callq {{.*}}foo
+; CHECK-APPLE: testq %r12, %r12
+; CHECK-APPLE: jne
+; CHECK-APPLE: ucomiss
+; CHECK-APPLE: jbe
+; Access part of the error object and save it to error_ref
+; CHECK-APPLE: movb 8(%r12)
+; CHECK-APPLE: movq %r12, %rdi
+; CHECK-APPLE: callq {{.*}}free
+
+; CHECK-O0-LABEL: caller2:
+; CHECK-O0: xorl
+; CHECK-O0: movl %{{.*}}, %r12d
+; CHECK-O0: callq {{.*}}foo
+; CHECK-O0: movq %r12, [[ID:%[a-z]+]]
+; CHECK-O0: cmpq $0, [[ID]]
+; CHECK-O0: jne
+entry:
+ %error_ptr_ref = alloca swifterror %swift_error*
+ br label %bb_loop
+bb_loop:
+ store %swift_error* null, %swift_error** %error_ptr_ref
+ %call = call float @foo(%swift_error** swifterror %error_ptr_ref)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+cont:
+ %cmp = fcmp ogt float %call, 1.000000e+00
+ br i1 %cmp, label %bb_end, label %bb_loop
+bb_end:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+ ret float 1.0
+}
+
+; "foo_if" is a function that takes a swifterror parameter, it sets swifterror
+; under a certain condition.
+define float @foo_if(%swift_error** swifterror %error_ptr_ref, i32 %cc) {
+; CHECK-APPLE-LABEL: foo_if:
+; CHECK-APPLE: testl %edi, %edi
+; CHECK-APPLE: je
+; CHECK-APPLE: movl $16, %edi
+; CHECK-APPLE: malloc
+; CHECK-APPLE: movb $1, 8(%rax)
+; CHECK-APPLE: movq %rax, %r12
+; CHECK-APPLE-NOT: %r12
+; CHECK-APPLE: ret
+
+; CHECK-O0-LABEL: foo_if:
+; CHECK-O0: cmpl $0
+; spill to stack
+; CHECK-O0: movq %r12, {{.*}}(%rsp)
+; CHECK-O0: je
+; CHECK-O0: movl $16,
+; CHECK-O0: malloc
+; CHECK-O0: movq %rax, [[ID:%[a-z]+]]
+; CHECK-O0-DAG: movb $1, 8(%rax)
+; CHECK-O0-DAG: movq [[ID]], %r12
+; CHECK-O0: ret
+; reload from stack
+; CHECK-O0: movq {{.*}}(%rsp), %r12
+; CHECK-O0: ret
+entry:
+ %cond = icmp ne i32 %cc, 0
+ br i1 %cond, label %gen_error, label %normal
+
+gen_error:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ ret float 1.0
+
+normal:
+ ret float 0.0
+}
+
+; "foo_loop" is a function that takes a swifterror parameter, it sets swifterror
+; under a certain condition inside a loop.
+define float @foo_loop(%swift_error** swifterror %error_ptr_ref, i32 %cc, float %cc2) {
+; CHECK-APPLE-LABEL: foo_loop:
+; CHECK-APPLE: movq %r12, %rax
+; CHECK-APPLE: testl
+; CHECK-APPLE: je
+; CHECK-APPLE: movl $16, %edi
+; CHECK-APPLE: malloc
+; CHECK-APPLE: movb $1, 8(%rax)
+; CHECK-APPLE: ucomiss
+; CHECK-APPLE: jbe
+; CHECK-APPLE: movq %rax, %r12
+; CHECK-APPLE: ret
+
+; CHECK-O0-LABEL: foo_loop:
+; spill to stack
+; CHECK-O0: movq %r12, {{.*}}(%rsp)
+; CHECK-O0: cmpl $0
+; CHECK-O0: je
+; CHECK-O0: movl $16,
+; CHECK-O0: malloc
+; CHECK-O0: movq %rax, [[ID:%[a-z]+]]
+; CHECK-O0: movb $1, 8([[ID]])
+; CHECK-O0: jbe
+; reload from stack
+; CHECK-O0: movq {{.*}}(%rsp), %r12
+; CHECK-O0: ret
+entry:
+ br label %bb_loop
+
+bb_loop:
+ %cond = icmp ne i32 %cc, 0
+ br i1 %cond, label %gen_error, label %bb_cont
+
+gen_error:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ br label %bb_cont
+
+bb_cont:
+ %cmp = fcmp ogt float %cc2, 1.000000e+00
+ br i1 %cmp, label %bb_end, label %bb_loop
+bb_end:
+ ret float 0.0
+}
+
+%struct.S = type { i32, i32, i32, i32, i32, i32 }
+
+; "foo_sret" is a function that takes a swifterror parameter, it also has a sret
+; parameter.
+define void @foo_sret(%struct.S* sret %agg.result, i32 %val1, %swift_error** swifterror %error_ptr_ref) {
+; CHECK-APPLE-LABEL: foo_sret:
+; CHECK-APPLE: movq %rdi, %{{.*}}
+; CHECK-APPLE: movl $16, %edi
+; CHECK-APPLE: malloc
+; CHECK-APPLE: movb $1, 8(%rax)
+; CHECK-APPLE: movl %{{.*}}, 4(%{{.*}})
+; CHECK-APPLE: movq %rax, %r12
+; CHECK-APPLE: movq %{{.*}}, %rax
+; CHECK-APPLE-NOT: x19
+
+; CHECK-O0-LABEL: foo_sret:
+; CHECK-O0: movl $16,
+; spill sret to stack
+; CHECK-O0: movq %rdi,
+; CHECK-O0: movq {{.*}}, %rdi
+; CHECK-O0: malloc
+; CHECK-O0: movb $1, 8(%rax)
+; CHECK-O0: movl %{{.*}}, 4(%{{.*}})
+; CHECK-O0: movq %{{.*}}, %r12
+; reload sret from stack
+; CHECK-O0: movq {{.*}}(%rsp), %rax
+; CHECK-O0: ret
+entry:
+ %call = call i8* @malloc(i64 16)
+ %call.0 = bitcast i8* %call to %swift_error*
+ store %swift_error* %call.0, %swift_error** %error_ptr_ref
+ %tmp = getelementptr inbounds i8, i8* %call, i64 8
+ store i8 1, i8* %tmp
+ %v2 = getelementptr inbounds %struct.S, %struct.S* %agg.result, i32 0, i32 1
+ store i32 %val1, i32* %v2
+ ret void
+}
+
+; "caller3" calls "foo_sret" that takes a swifterror parameter.
+define float @caller3(i8* %error_ref) {
+; CHECK-APPLE-LABEL: caller3:
+; CHECK-APPLE: movl $1, %esi
+; CHECK-APPLE: xorl %r12d, %r12d
+; CHECK-APPLE: callq {{.*}}foo_sret
+; CHECK-APPLE: testq %r12, %r12
+; CHECK-APPLE: jne
+; Access part of the error object and save it to error_ref
+; CHECK-APPLE: movb 8(%r12),
+; CHECK-APPLE: movb %{{.*}},
+; CHECK-APPLE: movq %r12, %rdi
+; CHECK-APPLE: callq {{.*}}free
+
+; CHECK-O0-LABEL: caller3:
+; CHECK-O0: xorl
+; CHECK-O0: movl {{.*}}, %r12d
+; CHECK-O0: movl $1, %esi
+; CHECK-O0: movq {{.*}}, %rdi
+; CHECK-O0: callq {{.*}}foo_sret
+; CHECK-O0: movq %r12,
+; CHECK-O0: cmpq $0
+; CHECK-O0: jne
+; Access part of the error object and save it to error_ref
+; CHECK-O0: movb 8(%{{.*}}),
+; CHECK-O0: movb %{{.*}},
+; reload from stack
+; CHECK-O0: movq {{.*}}(%rsp), %rdi
+; CHECK-O0: callq {{.*}}free
+entry:
+ %s = alloca %struct.S, align 8
+ %error_ptr_ref = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr_ref
+ call void @foo_sret(%struct.S* sret %s, i32 1, %swift_error** swifterror %error_ptr_ref)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+cont:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+ ret float 1.0
+}
+
+; This is a caller with multiple swifterror values; it calls "foo" twice, each
+; time with a different swifterror value obtained from an "alloca swifterror".
+define float @caller_with_multiple_swifterror_values(i8* %error_ref, i8* %error_ref2) {
+; CHECK-APPLE-LABEL: caller_with_multiple_swifterror_values:
+
+; The first swifterror value:
+; CHECK-APPLE: xorl %r12d, %r12d
+; CHECK-APPLE: callq {{.*}}foo
+; CHECK-APPLE: testq %r12, %r12
+; CHECK-APPLE: jne
+; Access part of the error object and save it to error_ref
+; CHECK-APPLE: movb 8(%r12)
+; CHECK-APPLE: movq %r12, %rdi
+; CHECK-APPLE: callq {{.*}}free
+
+; The second swifterror value:
+; CHECK-APPLE: xorl %r12d, %r12d
+; CHECK-APPLE: callq {{.*}}foo
+; CHECK-APPLE: testq %r12, %r12
+; CHECK-APPLE: jne
+; Access part of the error object and save it to error_ref
+; CHECK-APPLE: movb 8(%r12)
+; CHECK-APPLE: movq %r12, %rdi
+; CHECK-APPLE: callq {{.*}}free
+
+; CHECK-O0-LABEL: caller_with_multiple_swifterror_values:
+
+; The first swifterror value:
+; CHECK-O0: xorl
+; CHECK-O0: movl %{{.*}}, %r12d
+; CHECK-O0: callq {{.*}}foo
+; CHECK-O0: jne
+
+; The second swifterror value:
+; CHECK-O0: xorl
+; CHECK-O0: movl %{{.*}}, %r12d
+; CHECK-O0: callq {{.*}}foo
+; CHECK-O0: jne
+entry:
+ %error_ptr_ref = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr_ref
+ %call = call float @foo(%swift_error** swifterror %error_ptr_ref)
+ %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref
+ %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null
+ %tmp = bitcast %swift_error* %error_from_foo to i8*
+ br i1 %had_error_from_foo, label %handler, label %cont
+cont:
+ %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1
+ %t = load i8, i8* %v1
+ store i8 %t, i8* %error_ref
+ br label %handler
+handler:
+ call void @free(i8* %tmp)
+
+ %error_ptr_ref2 = alloca swifterror %swift_error*
+ store %swift_error* null, %swift_error** %error_ptr_ref2
+ %call2 = call float @foo(%swift_error** swifterror %error_ptr_ref2)
+ %error_from_foo2 = load %swift_error*, %swift_error** %error_ptr_ref2
+ %had_error_from_foo2 = icmp ne %swift_error* %error_from_foo2, null
+ %bitcast2 = bitcast %swift_error* %error_from_foo2 to i8*
+ br i1 %had_error_from_foo2, label %handler2, label %cont2
+cont2:
+ %v2 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo2, i64 0, i32 1
+ %t2 = load i8, i8* %v2
+ store i8 %t2, i8* %error_ref2
+ br label %handler2
+handler2:
+ call void @free(i8* %bitcast2)
+
+ ret float 1.0
+}
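Read together, the CHECK-APPLE lines in this file sketch the register protocol being exercised: the swifterror value appears to live in %r12 across the call. A condensed caller-side sequence assembled from the checks (an illustration of what the test expects, not a statement of the documented ABI; "handler" is just a placeholder label):

    xorl  %r12d, %r12d     # caller clears the error register before the call
    callq _foo             # callee moves the %swift_error* it allocated into %r12
    testq %r12, %r12       # caller tests for a non-null error
    jne   handler          # handler loads 8(%r12) and eventually calls free

The foo_if/foo_loop checks additionally show %r12 being spilled to and reloaded from the stack at -O0, which the "spill to stack" / "reload from stack" comments call out.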
diff --git a/test/CodeGen/X86/swiftself.ll b/test/CodeGen/X86/swiftself.ll
new file mode 100644
index 000000000000..c5e905945605
--- /dev/null
+++ b/test/CodeGen/X86/swiftself.ll
@@ -0,0 +1,62 @@
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -o - %s | FileCheck --check-prefix=CHECK --check-prefix=OPT %s
+; RUN: llc -O0 -verify-machineinstrs -mtriple=x86_64-unknown-unknown -o - %s | FileCheck %s
+
+; A parameter marked swiftself should be allocated to r13.
+; CHECK-LABEL: swiftself_param:
+; CHECK: movq %r13, %rax
+define i8 *@swiftself_param(i8* swiftself %addr0) {
+ ret i8 *%addr0
+}
+
+; Check that r13 is used to pass a swiftself argument.
+; CHECK-LABEL: call_swiftself:
+; CHECK: movq %rdi, %r13
+; CHECK: callq {{_?}}swiftself_param
+define i8 *@call_swiftself(i8* %arg) {
+ %res = call i8 *@swiftself_param(i8* swiftself %arg)
+ ret i8 *%res
+}
+
+; r13 should be saved by the callee even if used for swiftself
+; CHECK-LABEL: swiftself_clobber:
+; CHECK: pushq %r13
+; ...
+; CHECK: popq %r13
+define i8 *@swiftself_clobber(i8* swiftself %addr0) {
+ call void asm sideeffect "nop", "~{r13}"()
+ ret i8 *%addr0
+}
+
+; Demonstrate that we do not need any movs when calling multiple functions
+; with a swiftself argument.
+; CHECK-LABEL: swiftself_passthrough:
+; OPT-NOT: mov{{.*}}r13
+; OPT: callq {{_?}}swiftself_param
+; OPT-NOT: mov{{.*}}r13
+; OPT-NEXT: callq {{_?}}swiftself_param
+define void @swiftself_passthrough(i8* swiftself %addr0) {
+ call i8 *@swiftself_param(i8* swiftself %addr0)
+ call i8 *@swiftself_param(i8* swiftself %addr0)
+ ret void
+}
+
+; We can use a tail call if the callee's swiftself is the same as the caller's.
+; CHECK-LABEL: swiftself_tail:
+; OPT: jmp {{_?}}swiftself_param
+; OPT-NOT: ret
+define i8* @swiftself_tail(i8* swiftself %addr0) {
+ call void asm sideeffect "", "~{r13}"()
+ %res = tail call i8* @swiftself_param(i8* swiftself %addr0)
+ ret i8* %res
+}
+
+; We cannot use a tail call if the callee's swiftself is not the same as the
+; caller's.
+; CHECK-LABEL: swiftself_notail:
+; CHECK: movq %rdi, %r13
+; CHECK: callq {{_?}}swiftself_param
+; CHECK: retq
+define i8* @swiftself_notail(i8* swiftself %addr0, i8* %addr1) nounwind {
+ %res = tail call i8* @swiftself_param(i8* swiftself %addr1)
+ ret i8* %res
+}
diff --git a/test/CodeGen/X86/switch-bt.ll b/test/CodeGen/X86/switch-bt.ll
index 6a2cbe1ec6ca..e4fbbeb26c3a 100644
--- a/test/CodeGen/X86/switch-bt.ll
+++ b/test/CodeGen/X86/switch-bt.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 -asm-verbose=false < %s | FileCheck %s
+; RUN: llc -march=x86-64 -asm-verbose=false < %s -jump-table-density=40 | FileCheck %s
; This switch should use bit tests, and the third bit test case is just
; testing for one possible value, so it doesn't need a bt.
diff --git a/test/CodeGen/X86/switch-density.ll b/test/CodeGen/X86/switch-density.ll
new file mode 100644
index 000000000000..52216fb4d7c2
--- /dev/null
+++ b/test/CodeGen/X86/switch-density.ll
@@ -0,0 +1,81 @@
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -jump-table-density=25 | FileCheck %s --check-prefix=DENSE --check-prefix=CHECK
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -jump-table-density=10 | FileCheck %s --check-prefix=SPARSE --check-prefix=CHECK
+
+declare void @g(i32)
+
+define void @sparse(i32 %x) {
+entry:
+ switch i32 %x, label %return [
+ i32 300, label %bb0
+ i32 100, label %bb1
+ i32 400, label %bb1
+ i32 500, label %bb2
+ ]
+bb0: tail call void @g(i32 0) br label %return
+bb1: tail call void @g(i32 1) br label %return
+bb2: tail call void @g(i32 1) br label %return
+return: ret void
+
+; Should pivot around 400 for two subtrees with two cases each.
+; CHECK-LABEL: sparse
+; CHECK-NOT: cmpl
+; CHECK: cmpl $399
+; CHECK: cmpl $100
+; CHECK: cmpl $300
+; CHECK: cmpl $400
+; CHECK: cmpl $500
+}
+
+define void @med(i32 %x) {
+entry:
+ switch i32 %x, label %return [
+ i32 30, label %bb0
+ i32 10, label %bb1
+ i32 40, label %bb1
+ i32 50, label %bb2
+ i32 20, label %bb3
+ ]
+bb0: tail call void @g(i32 0) br label %return
+bb1: tail call void @g(i32 1) br label %return
+bb2: tail call void @g(i32 1) br label %return
+bb3: tail call void @g(i32 2) br label %return
+return: ret void
+
+; Lowered as a jump table when sparse, and branches when dense.
+; CHECK-LABEL: med
+; SPARSE: addl $-10
+; SPARSE: cmpl $40
+; SPARSE: ja
+; SPARSE: jmpq *.LJTI
+; DENSE-NOT: cmpl
+; DENSE: cmpl $29
+; DENSE-DAG: cmpl $10
+; DENSE-DAG: cmpl $20
+; DENSE-DAG: cmpl $30
+; DENSE-DAG: cmpl $40
+; DENSE-DAG: cmpl $50
+; DENSE: retq
+}
+
+define void @dense(i32 %x) {
+entry:
+ switch i32 %x, label %return [
+ i32 12, label %bb0
+ i32 4, label %bb1
+ i32 16, label %bb1
+ i32 20, label %bb2
+ i32 8, label %bb3
+ ]
+bb0: tail call void @g(i32 0) br label %return
+bb1: tail call void @g(i32 1) br label %return
+bb2: tail call void @g(i32 1) br label %return
+bb3: tail call void @g(i32 2) br label %return
+return: ret void
+
+; Dense enough to be lowered as a jump table under both density thresholds.
+; CHECK-LABEL: dense
+; CHECK: addl $-4
+; CHECK: cmpl $16
+; CHECK: ja
+; CHECK: jmpq *.LJTI
+}
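The comments in this file lean on how -jump-table-density relates the number of cases to the width of the case range. A back-of-the-envelope check, taking density as roughly cases / (max - min + 1) in percent (an approximation of the heuristic, not its exact definition):

    @sparse: 4 cases over [100, 500]  ->  4/401 ~  1%   below both thresholds: no jump table, hence the pivot at 400
    @med:    5 cases over [10, 50]    ->  5/41  ~ 12%   >= 10 but < 25: jump table only in the SPARSE run
    @dense:  5 cases over [4, 20]     ->  5/17  ~ 29%   >= both thresholds: jump table in both runs

which matches the SPARSE/DENSE check lines in each function.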
diff --git a/test/CodeGen/X86/switch-edge-weight.ll b/test/CodeGen/X86/switch-edge-weight.ll
index b8cb7b1280ad..3679433c372f 100644
--- a/test/CodeGen/X86/switch-edge-weight.ll
+++ b/test/CodeGen/X86/switch-edge-weight.ll
@@ -233,11 +233,11 @@ entry:
; block.
switch i32 %x, label %sw.default [
- i32 1, label %sw.bb
- i32 5, label %sw.bb2
- i32 7, label %sw.bb3
- i32 9, label %sw.bb4
- i32 31, label %sw.bb5
+ i32 4, label %sw.bb
+ i32 20, label %sw.bb2
+ i32 28, label %sw.bb3
+ i32 36, label %sw.bb4
+ i32 124, label %sw.bb5
], !prof !2
sw.bb:
@@ -272,7 +272,7 @@ sw.epilog:
;
; CHECK: BB#0:
; BB#0 to BB#6: [10, UINT32_MAX] (15)
-; BB#0 to BB#8: [1, 5, 7, 9] (jump table) (45)
+; BB#0 to BB#8: [4, 20, 28, 36] (jump table) (45)
; CHECK: Successors according to CFG: BB#8({{[0-9a-fx/= ]+}}25.00%) BB#9({{[0-9a-fx/= ]+}}75.00%)
}
diff --git a/test/CodeGen/X86/switch-jump-table.ll b/test/CodeGen/X86/switch-jump-table.ll
index 896a067da230..6393c688e282 100644
--- a/test/CodeGen/X86/switch-jump-table.ll
+++ b/test/CodeGen/X86/switch-jump-table.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i686-pc-gnu-linux < %s | FileCheck %s -check-prefix=CHECK
+; RUN: llc -mtriple=i686-pc-gnu-linux < %s | FileCheck %s
; RUN: llc -mtriple=i686-pc-gnu-linux -print-machineinstrs=expand-isel-pseudos %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=CHECK-JT-PROB
diff --git a/test/CodeGen/X86/switch.ll b/test/CodeGen/X86/switch.ll
index 46587341ea74..5d52f95e71cc 100644
--- a/test/CodeGen/X86/switch.ll
+++ b/test/CodeGen/X86/switch.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s
-; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -O0 | FileCheck --check-prefix=NOOPT %s
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -jump-table-density=40 -verify-machineinstrs | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -O0 -jump-table-density=40 -verify-machineinstrs | FileCheck --check-prefix=NOOPT %s
declare void @g(i32)
@@ -30,6 +30,47 @@ return: ret void
; NOOPT: jmpq
}
+; Should never be lowered as a jump table because of the attribute
+define void @basic_nojumptable(i32 %x) "no-jump-tables"="true" {
+entry:
+ switch i32 %x, label %return [
+ i32 3, label %bb0
+ i32 1, label %bb1
+ i32 4, label %bb1
+ i32 5, label %bb2
+ ]
+bb0: tail call void @g(i32 0) br label %return
+bb1: tail call void @g(i32 1) br label %return
+bb2: tail call void @g(i32 1) br label %return
+return: ret void
+
+; Not lowered as a jump table because of the attribute.
+; CHECK-LABEL: basic_nojumptable
+; CHECK-NOT: jmpq *.LJTI
+}
+
+; Should be lowered as a jump table because of the attribute
+define void @basic_nojumptable_false(i32 %x) "no-jump-tables"="false" {
+entry:
+ switch i32 %x, label %return [
+ i32 3, label %bb0
+ i32 1, label %bb1
+ i32 4, label %bb1
+ i32 5, label %bb2
+ ]
+bb0: tail call void @g(i32 0) br label %return
+bb1: tail call void @g(i32 1) br label %return
+bb2: tail call void @g(i32 1) br label %return
+return: ret void
+
+; Lowered as a jump table, both with and without optimization.
+; CHECK-LABEL: basic_nojumptable_false
+; CHECK: decl
+; CHECK: cmpl $4
+; CHECK: ja
+; CHECK: jmpq *.LJTI
+}
+
define void @simple_ranges(i32 %x) {
entry:
@@ -47,6 +88,8 @@ bb0: tail call void @g(i32 0) br label %return
bb1: tail call void @g(i32 1) br label %return
return: ret void
+
+
; Should be lowered to two range checks.
; CHECK-LABEL: simple_ranges
; CHECK: leal -100
@@ -705,3 +748,33 @@ return: ret void
; Don't assert due to truncating the bitwidth (64) to i4 when checking
; that the bit-test range fits in a word.
}
+
+
+define i32 @pr27135(i32 %i) {
+entry:
+ br i1 undef, label %sw, label %end
+sw:
+ switch i32 %i, label %end [
+ i32 99, label %sw.bb
+ i32 98, label %sw.bb
+ i32 101, label %sw.bb
+ i32 97, label %sw.bb2
+ i32 96, label %sw.bb2
+ i32 100, label %sw.bb2
+ ]
+sw.bb:
+ unreachable
+sw.bb2:
+ unreachable
+end:
+ %p = phi i32 [ 1, %sw ], [ 0, %entry ]
+ ret i32 %p
+
+; CHECK-LABEL: pr27135:
+; The switch is lowered with bit tests. Since the case range is contiguous, the
+; second bit test is redundant and can be skipped. Check that we don't update
+; the phi node with an incoming value from the MBB of the skipped bit test
+; (-verify-machineinstrs catches this).
+; CHECK: btl
+; CHECK-NOT: btl
+}
diff --git a/test/CodeGen/X86/tail-call-attrs.ll b/test/CodeGen/X86/tail-call-attrs.ll
index 17ebe997c8c1..90f1346de9aa 100644
--- a/test/CodeGen/X86/tail-call-attrs.ll
+++ b/test/CodeGen/X86/tail-call-attrs.ll
@@ -13,11 +13,11 @@ define zeroext i1 @test_bool() {
; Here, there's more zero extension to be done between the call and the return,
; so a tail call is impossible (well, according to current Clang practice
; anyway. The AMD64 ABI isn't crystal clear on the matter).
+; FIXME: The high 24 bits returned from test_i32 are undefined; do tail call!
declare zeroext i32 @give_i32()
define zeroext i8 @test_i32() {
; CHECK-LABEL: test_i32:
; CHECK: callq _give_i32
-; CHECK: movzbl %al, %eax
; CHECK: ret
%call = tail call zeroext i32 @give_i32()
@@ -27,11 +27,11 @@ define zeroext i8 @test_i32() {
; Here, one function is zeroext and the other is signext. To the extent that
; these both mean something they are incompatible so no tail call is possible.
+; FIXME: The high 16 bits returned are undefined; do tail call!
declare zeroext i16 @give_unsigned_i16()
define signext i16 @test_incompatible_i16() {
; CHECK-LABEL: test_incompatible_i16:
; CHECK: callq _give_unsigned_i16
-; CHECK: cwtl
; CHECK: ret
%call = tail call zeroext i16 @give_unsigned_i16()
diff --git a/test/CodeGen/X86/tail-call-casts.ll b/test/CodeGen/X86/tail-call-casts.ll
new file mode 100644
index 000000000000..5421b498e1ea
--- /dev/null
+++ b/test/CodeGen/X86/tail-call-casts.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=i686-unknown-linux-gnu -o - %s | FileCheck %s
+
+declare void @g_bool(i1 zeroext)
+define void @f_bool(i1 zeroext %x) {
+entry:
+ tail call void @g_bool(i1 zeroext %x)
+ ret void
+
+; Forwarding a bool in a tail call works.
+; CHECK-LABEL: f_bool:
+; CHECK-NOT: movz
+; CHECK: jmp g_bool
+}
+
+
+declare void @g_float(float)
+define void @f_i32(i32 %x) {
+entry:
+ %0 = bitcast i32 %x to float
+ tail call void @g_float(float %0)
+ ret void
+
+; Forwarding a bitcasted value works too.
+; CHECK-LABEL: f_i32
+; CHECK-NOT: mov
+; CHECK: jmp g_float
+}
diff --git a/test/CodeGen/X86/tail-call-parameter-attrs-mismatch.ll b/test/CodeGen/X86/tail-call-parameter-attrs-mismatch.ll
new file mode 100644
index 000000000000..73ce3b781f9d
--- /dev/null
+++ b/test/CodeGen/X86/tail-call-parameter-attrs-mismatch.ll
@@ -0,0 +1,40 @@
+; RUN: llc -mtriple=i686-unknown-linux-gnu -o - %s | FileCheck %s
+
+declare void @f(i16 signext)
+declare void @g(i32 signext)
+
+
+define void @flags_match(i16 signext %x) {
+entry:
+ tail call void @f(i16 signext %x)
+ ret void
+
+; The parameter flags match; do the tail call.
+; CHECK-LABEL: flags_match:
+; CHECK: jmp f
+}
+
+define void @flags_mismatch(i16 zeroext %x) {
+entry:
+ tail call void @f(i16 signext %x)
+ ret void
+
+; The parameter flags mismatch. %x has not been sign-extended,
+; so a tail call is not possible.
+; CHECK-LABEL: flags_mismatch:
+; CHECK: movswl
+; CHECK: calll f
+}
+
+
+define void @mismatch_doesnt_matter(i32 zeroext %x) {
+entry:
+ tail call void @g(i32 signext %x)
+ ret void
+
+; The parameter flags mismatch, but the type is wide enough that
+; no extension takes place in practice, so do the tail call.
+
+; CHECK-LABEL: mismatch_doesnt_matter:
+; CHECK: jmp g
+}
diff --git a/test/CodeGen/X86/tail-merge-unreachable.ll b/test/CodeGen/X86/tail-merge-unreachable.ll
new file mode 100644
index 000000000000..7b2c0f727215
--- /dev/null
+++ b/test/CodeGen/X86/tail-merge-unreachable.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -verify-machineinstrs | FileCheck %s
+
+define i32 @tail_merge_unreachable(i32 %i) {
+entry:
+ br i1 undef, label %sw, label %end
+sw:
+ switch i32 %i, label %end [
+ i32 99, label %sw.bb
+ i32 98, label %sw.bb
+ i32 101, label %sw.bb
+ i32 97, label %sw.bb2
+ i32 96, label %sw.bb2
+ i32 100, label %sw.bb2
+ ]
+sw.bb:
+ unreachable
+sw.bb2:
+ unreachable
+end:
+ %p = phi i32 [ 1, %sw ], [ 0, %entry ]
+ ret i32 %p
+
+; CHECK-LABEL: tail_merge_unreachable:
+; Range Check
+; CHECK: addl $-96
+; CHECK: cmpl $5
+; CHECK: jbe [[JUMP_TABLE_BLOCK:[.][A-Za-z0-9_]+]]
+; CHECK: retq
+; CHECK: [[JUMP_TABLE_BLOCK]]:
+; CHECK: btl
+; CHECK: jae [[UNREACHABLE_BLOCK:[.][A-Za-z0-9_]+]]
+; CHECK: [[UNREACHABLE_BLOCK]]:
+; CHECK: .Lfunc_end0
+}
diff --git a/test/CodeGen/X86/tail-opts.ll b/test/CodeGen/X86/tail-opts.ll
index bf778e5bad2b..12c90c1a5fa9 100644
--- a/test/CodeGen/X86/tail-opts.ll
+++ b/test/CodeGen/X86/tail-opts.ll
@@ -376,7 +376,7 @@ return:
; CHECK-LABEL: two_minsize:
; CHECK-NOT: XYZ
; CHECK: ret
-; CHECK: movl $0, XYZ(%rip)
+; CHECK: andl $0, XYZ(%rip)
; CHECK: movl $1, XYZ(%rip)
; CHECK-NOT: XYZ
diff --git a/test/CodeGen/X86/tailcall-stackalign.ll b/test/CodeGen/X86/tailcall-stackalign.ll
index d3f811cff248..256477d52cde 100644
--- a/test/CodeGen/X86/tailcall-stackalign.ll
+++ b/test/CodeGen/X86/tailcall-stackalign.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i686-unknown-linux -tailcallopt | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-linux -tailcallopt -no-x86-call-frame-opt | FileCheck %s
; Linux has 8 byte alignment so the params cause stack size 20 when tailcallopt
; is enabled, ensure that a normal fastcc call has matching stack size
diff --git a/test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll
new file mode 100644
index 000000000000..f6c49cab71b2
--- /dev/null
+++ b/test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll
@@ -0,0 +1,133 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+tbm | FileCheck %s --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/tbm-builtins.c
+
+define i64 @test__bextri_u64(i64 %a0) {
+; X64-LABEL: test__bextri_u64:
+; X64: # BB#0:
+; X64-NEXT: bextr $1, %rdi, %rax
+; X64-NEXT: retq
+ %1 = call i64 @llvm.x86.tbm.bextri.u64(i64 %a0, i64 1)
+ ret i64 %1
+}
+
+define i64 @test__blcfill_u64(i64 %a0) {
+; X64-LABEL: test__blcfill_u64:
+; X64: # BB#0:
+; X64-NEXT: leaq 1(%rdi), %rax
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: retq
+ %1 = add i64 %a0, 1
+ %2 = and i64 %a0, %1
+ ret i64 %2
+}
+
+define i64 @test__blci_u64(i64 %a0) {
+; X64-LABEL: test__blci_u64:
+; X64: # BB#0:
+; X64-NEXT: leaq 1(%rdi), %rax
+; X64-NEXT: xorq $-1, %rax
+; X64-NEXT: orq %rdi, %rax
+; X64-NEXT: retq
+ %1 = add i64 %a0, 1
+ %2 = xor i64 %1, -1
+ %3 = or i64 %a0, %2
+ ret i64 %3
+}
+
+define i64 @test__blcic_u64(i64 %a0) {
+; X64-LABEL: test__blcic_u64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: xorq $-1, %rax
+; X64-NEXT: addq $1, %rdi
+; X64-NEXT: andq %rax, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %1 = xor i64 %a0, -1
+ %2 = add i64 %a0, 1
+ %3 = and i64 %1, %2
+ ret i64 %3
+}
+
+define i64 @test__blcmsk_u64(i64 %a0) {
+; X64-LABEL: test__blcmsk_u64:
+; X64: # BB#0:
+; X64-NEXT: leaq 1(%rdi), %rax
+; X64-NEXT: xorq %rdi, %rax
+; X64-NEXT: retq
+ %1 = add i64 %a0, 1
+ %2 = xor i64 %a0, %1
+ ret i64 %2
+}
+
+define i64 @test__blcs_u64(i64 %a0) {
+; X64-LABEL: test__blcs_u64:
+; X64: # BB#0:
+; X64-NEXT: leaq 1(%rdi), %rax
+; X64-NEXT: orq %rdi, %rax
+; X64-NEXT: retq
+ %1 = add i64 %a0, 1
+ %2 = or i64 %a0, %1
+ ret i64 %2
+}
+
+define i64 @test__blsfill_u64(i64 %a0) {
+; X64-LABEL: test__blsfill_u64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: subq $1, %rax
+; X64-NEXT: orq %rdi, %rax
+; X64-NEXT: retq
+ %1 = sub i64 %a0, 1
+ %2 = or i64 %a0, %1
+ ret i64 %2
+}
+
+define i64 @test__blsic_u64(i64 %a0) {
+; X64-LABEL: test__blsic_u64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: xorq $-1, %rax
+; X64-NEXT: subq $1, %rdi
+; X64-NEXT: orq %rax, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %1 = xor i64 %a0, -1
+ %2 = sub i64 %a0, 1
+ %3 = or i64 %1, %2
+ ret i64 %3
+}
+
+define i64 @test__t1mskc_u64(i64 %a0) {
+; X64-LABEL: test__t1mskc_u64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: xorq $-1, %rax
+; X64-NEXT: addq $1, %rdi
+; X64-NEXT: orq %rax, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %1 = xor i64 %a0, -1
+ %2 = add i64 %a0, 1
+ %3 = or i64 %1, %2
+ ret i64 %3
+}
+
+define i64 @test__tzmsk_u64(i64 %a0) {
+; X64-LABEL: test__tzmsk_u64:
+; X64: # BB#0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: xorq $-1, %rax
+; X64-NEXT: subq $1, %rdi
+; X64-NEXT: andq %rax, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %1 = xor i64 %a0, -1
+ %2 = sub i64 %a0, 1
+ %3 = and i64 %1, %2
+ ret i64 %3
+}
+
+declare i64 @llvm.x86.tbm.bextri.u64(i64, i64)
diff --git a/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll b/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..a264adffe790
--- /dev/null
+++ b/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i686-unknown-unknown -mattr=+tbm | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+tbm | FileCheck %s --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/tbm-builtins.c
+
+define i32 @test__bextri_u32(i32 %a0) {
+; X32-LABEL: test__bextri_u32:
+; X32: # BB#0:
+; X32-NEXT: bextr $1, {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__bextri_u32:
+; X64: # BB#0:
+; X64-NEXT: bextr $1, %edi, %eax
+; X64-NEXT: retq
+ %1 = call i32 @llvm.x86.tbm.bextri.u32(i32 %a0, i32 1)
+ ret i32 %1
+}
+
+define i32 @test__blcfill_u32(i32 %a0) {
+; X32-LABEL: test__blcfill_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: leal 1(%ecx), %eax
+; X32-NEXT: andl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blcfill_u32:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: leal 1(%rdi), %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: retq
+ %1 = add i32 %a0, 1
+ %2 = and i32 %a0, %1
+ ret i32 %2
+}
+
+define i32 @test__blci_u32(i32 %a0) {
+; X32-LABEL: test__blci_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: leal 1(%ecx), %eax
+; X32-NEXT: xorl $-1, %eax
+; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blci_u32:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: leal 1(%rdi), %eax
+; X64-NEXT: xorl $-1, %eax
+; X64-NEXT: orl %edi, %eax
+; X64-NEXT: retq
+ %1 = add i32 %a0, 1
+ %2 = xor i32 %1, -1
+ %3 = or i32 %a0, %2
+ ret i32 %3
+}
+
+define i32 @test__blcic_u32(i32 %a0) {
+; X32-LABEL: test__blcic_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: xorl $-1, %ecx
+; X32-NEXT: addl $1, %eax
+; X32-NEXT: andl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blcic_u32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: xorl $-1, %eax
+; X64-NEXT: addl $1, %edi
+; X64-NEXT: andl %eax, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %1 = xor i32 %a0, -1
+ %2 = add i32 %a0, 1
+ %3 = and i32 %1, %2
+ ret i32 %3
+}
+
+define i32 @test__blcmsk_u32(i32 %a0) {
+; X32-LABEL: test__blcmsk_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: leal 1(%ecx), %eax
+; X32-NEXT: xorl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blcmsk_u32:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: leal 1(%rdi), %eax
+; X64-NEXT: xorl %edi, %eax
+; X64-NEXT: retq
+ %1 = add i32 %a0, 1
+ %2 = xor i32 %a0, %1
+ ret i32 %2
+}
+
+define i32 @test__blcs_u32(i32 %a0) {
+; X32-LABEL: test__blcs_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: leal 1(%ecx), %eax
+; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blcs_u32:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: leal 1(%rdi), %eax
+; X64-NEXT: orl %edi, %eax
+; X64-NEXT: retq
+ %1 = add i32 %a0, 1
+ %2 = or i32 %a0, %1
+ ret i32 %2
+}
+
+define i32 @test__blsfill_u32(i32 %a0) {
+; X32-LABEL: test__blsfill_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: subl $1, %eax
+; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blsfill_u32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: subl $1, %eax
+; X64-NEXT: orl %edi, %eax
+; X64-NEXT: retq
+ %1 = sub i32 %a0, 1
+ %2 = or i32 %a0, %1
+ ret i32 %2
+}
+
+define i32 @test__blsic_u32(i32 %a0) {
+; X32-LABEL: test__blsic_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: xorl $-1, %ecx
+; X32-NEXT: subl $1, %eax
+; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__blsic_u32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: xorl $-1, %eax
+; X64-NEXT: subl $1, %edi
+; X64-NEXT: orl %eax, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %1 = xor i32 %a0, -1
+ %2 = sub i32 %a0, 1
+ %3 = or i32 %1, %2
+ ret i32 %3
+}
+
+define i32 @test__t1mskc_u32(i32 %a0) {
+; X32-LABEL: test__t1mskc_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: xorl $-1, %ecx
+; X32-NEXT: addl $1, %eax
+; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__t1mskc_u32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: xorl $-1, %eax
+; X64-NEXT: addl $1, %edi
+; X64-NEXT: orl %eax, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %1 = xor i32 %a0, -1
+ %2 = add i32 %a0, 1
+ %3 = or i32 %1, %2
+ ret i32 %3
+}
+
+define i32 @test__tzmsk_u32(i32 %a0) {
+; X32-LABEL: test__tzmsk_u32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: xorl $-1, %ecx
+; X32-NEXT: subl $1, %eax
+; X32-NEXT: andl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test__tzmsk_u32:
+; X64: # BB#0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: xorl $-1, %eax
+; X64-NEXT: subl $1, %edi
+; X64-NEXT: andl %eax, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %1 = xor i32 %a0, -1
+ %2 = sub i32 %a0, 1
+ %3 = and i32 %1, %2
+ ret i32 %3
+}
+
+declare i32 @llvm.x86.tbm.bextri.u32(i32, i32)
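Apart from _bextri_, which goes through an explicit llvm.x86.tbm intrinsic, the bodies in this file (and in the 64-bit file just before it) are plain add/sub/and/or/xor patterns that the backend is expected to match to single TBM instructions. Working two of them through on a sample value makes the patterns easier to read (the inputs 0b10011 and 0b10100 are only illustrations):

    blcfill: a & (a + 1)     a = 10011 -> a + 1 = 10100 -> result 10000   (trailing 1-bits cleared)
    tzmsk:   ~a & (a - 1)    a = 10100 -> a - 1 = 10011, ~a = ...01011 -> result 00011   (mask of the trailing 0-bits)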
diff --git a/test/CodeGen/X86/tls-android.ll b/test/CodeGen/X86/tls-android.ll
index 4156c7b3f5b9..53717f564fac 100644
--- a/test/CodeGen/X86/tls-android.ll
+++ b/test/CodeGen/X86/tls-android.ll
@@ -37,7 +37,7 @@ entry:
; CHECK-NOT: __emutls_v.external_x:
-; CHECK: .align 4
+; CHECK: .p2align 2
; CHECK-LABEL: __emutls_v.external_y:
; CHECK-NEXT: .long 4
; CHECK-NEXT: .long 4
@@ -46,7 +46,7 @@ entry:
; CHECK-LABEL: __emutls_t.external_y:
; CHECK-NEXT: .long 7
-; CHECK: .align 4
+; CHECK: .p2align 2
; CHECK-LABEL: __emutls_v.internal_y:
; CHECK-NEXT: .long 4
; CHECK-NEXT: .long 4
@@ -70,7 +70,7 @@ entry:
; X64-NOT: __emutls_v.external_x:
-; X64: .align 8
+; X64: .p2align 3
; X64-LABEL: __emutls_v.external_y:
; X64-NEXT: .quad 4
; X64-NEXT: .quad 4
@@ -79,7 +79,7 @@ entry:
; X64-LABEL: __emutls_t.external_y:
; X64-NEXT: .long 7
-; X64: .align 8
+; X64: .p2align 3
; X64-LABEL: __emutls_v.internal_y:
; X64-NEXT: .quad 4
; X64-NEXT: .quad 4
diff --git a/test/CodeGen/X86/tls-pie.ll b/test/CodeGen/X86/tls-pie.ll
index 235230e3c6a8..30c219d691e6 100644
--- a/test/CodeGen/X86/tls-pie.ll
+++ b/test/CodeGen/X86/tls-pie.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic -enable-pie \
+; RUN: llc < %s -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X32 %s
-; RUN: llc < %s -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic -enable-pie \
+; RUN: llc < %s -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X64 %s
@i = thread_local global i32 15
@@ -79,3 +79,7 @@ define i32* @f4() {
entry:
ret i32* @i2
}
+
+!llvm.module.flags = !{!0, !1}
+!0 = !{i32 1, !"PIC Level", i32 1}
+!1 = !{i32 1, !"PIE Level", i32 1}
diff --git a/test/CodeGen/X86/tls-windows-itanium.ll b/test/CodeGen/X86/tls-windows-itanium.ll
new file mode 100644
index 000000000000..20ac09901969
--- /dev/null
+++ b/test/CodeGen/X86/tls-windows-itanium.ll
@@ -0,0 +1,30 @@
+; RUN: llc -mtriple i686-windows-itanium -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-ASM
+; RUN: llc -mtriple i686-windows-itanium -filetype obj -o - %s | llvm-readobj -relocations - | FileCheck %s -check-prefix CHECK-OBJ
+
+@get_count_incremented.count = internal thread_local unnamed_addr global i32 0, align 4
+
+define i32 @get_count_incremented() {
+entry:
+ %0 = load i32, i32* @get_count_incremented.count, align 4
+ %inc = add i32 %0, 1
+ store i32 %inc, i32* @get_count_incremented.count, align 4
+ ret i32 %inc
+}
+
+; CHECK-ASM-LABEL: _get_count_incremented:
+; CHECK-ASM: movl __tls_index, %eax
+; CHECK-ASM: movl %fs:__tls_array, %ecx
+; CHECK-ASM: movl (%ecx,%eax,4), %ecx
+; CHECK-ASM: _get_count_incremented.count@SECREL32(%ecx), %eax
+; CHECK-ASM: incl %eax
+; CHECK-ASM: movl %eax, _get_count_incremented.count@SECREL32(%ecx)
+; CHECK-ASM: retl
+
+; CHECK-OBJ: Relocations [
+; CHECK-OBJ: Section ({{[0-9]+}}) .text {
+; CHECK-OBJ: 0x1 IMAGE_REL_I386_DIR32 __tls_index
+; CHECK-OBJ: 0x8 IMAGE_REL_I386_DIR32 __tls_array
+; CHECK-OBJ: 0x11 IMAGE_REL_I386_SECREL _get_count_incremented.count
+; CHECK-OBJ: 0x18 IMAGE_REL_I386_SECREL _get_count_incremented.count
+; CHECK-OBJ: }
+; CHECK-OBJ: ]
diff --git a/test/CodeGen/X86/tls.ll b/test/CodeGen/X86/tls.ll
index 0f3d3adec4c3..85c51e618b2a 100644
--- a/test/CodeGen/X86/tls.ll
+++ b/test/CodeGen/X86/tls.ll
@@ -10,6 +10,7 @@
@i3 = internal thread_local global i32 15
@i4 = hidden thread_local global i32 15
@i5 = external hidden thread_local global i32
+@i6 = external protected thread_local global i32
@s1 = thread_local global i16 15
@b1 = thread_local global i8 0
@b2 = thread_local(localexec) global i8 0
@@ -438,3 +439,17 @@ entry:
ret i8* @b2
}
+
+define i32* @f16() {
+; X32_LINUX-LABEL: f16:
+; X32_LINUX: movl %gs:0, %eax
+; X32_LINUX-NEXT: leal i6@NTPOFF(%eax), %eax
+; X32_LINUX-NEXT: ret
+
+; X64_LINUX-LABEL: f16:
+; X64_LINUX: movq %fs:0, %rax
+; X64_LINUX-NEXT: leaq i6@TPOFF(%rax), %rax
+; X64_LINUX-NEXT: ret
+
+ ret i32* @i6
+}
diff --git a/test/CodeGen/X86/trunc-to-bool.ll b/test/CodeGen/X86/trunc-to-bool.ll
index 3dd98eea7fa9..3c99928824bc 100644
--- a/test/CodeGen/X86/trunc-to-bool.ll
+++ b/test/CodeGen/X86/trunc-to-bool.ll
@@ -1,14 +1,14 @@
; An integer truncation to i1 should be done with an and instruction to make
; sure only the LSBit survives. Test that this is the case both for a returned
; value and as the operand of a branch.
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s
define zeroext i1 @test1(i32 %X) nounwind {
%Y = trunc i32 %X to i1
ret i1 %Y
}
; CHECK-LABEL: test1:
-; CHECK: andl $1, %eax
+; CHECK: andb $1, %al
define i1 @test2(i32 %val, i32 %mask) nounwind {
entry:
diff --git a/test/CodeGen/X86/twoaddr-coalesce.ll b/test/CodeGen/X86/twoaddr-coalesce.ll
index 38685ec27c02..c727f34cc9a5 100644
--- a/test/CodeGen/X86/twoaddr-coalesce.ll
+++ b/test/CodeGen/X86/twoaddr-coalesce.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep mov | count 4
+; RUN: llc < %s -march=x86 | grep mov | count 2
; rdar://6523745
@"\01LC" = internal constant [4 x i8] c"%d\0A\00" ; <[4 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/uint_to_fp-2.ll b/test/CodeGen/X86/uint_to_fp-2.ll
index 4b594f7c62ab..d2b78a8886f0 100644
--- a/test/CodeGen/X86/uint_to_fp-2.ll
+++ b/test/CodeGen/X86/uint_to_fp-2.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown -march=x86 -mattr=+sse2 | FileCheck %s
; rdar://6504833
@@ -5,8 +6,8 @@ define float @test1(i32 %x) nounwind readnone {
; CHECK-LABEL: test1:
; CHECK: # BB#0: # %entry
; CHECK-NEXT: pushl %eax
-; CHECK-NEXT: movsd .LCPI0_0, %xmm0
-; CHECK-NEXT: movd {{[0-9]+}}(%esp), %xmm1
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: orpd %xmm0, %xmm1
; CHECK-NEXT: subsd %xmm0, %xmm1
; CHECK-NEXT: xorps %xmm0, %xmm0
@@ -16,8 +17,8 @@ define float @test1(i32 %x) nounwind readnone {
; CHECK-NEXT: popl %eax
; CHECK-NEXT: retl
entry:
- %0 = uitofp i32 %x to float
- ret float %0
+ %0 = uitofp i32 %x to float
+ ret float %0
}
; PR10802
@@ -26,8 +27,8 @@ define float @test2(<4 x i32> %x) nounwind readnone ssp {
; CHECK: # BB#0: # %entry
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: xorps %xmm1, %xmm1
-; CHECK-NEXT: movss %xmm0, %xmm1
-; CHECK-NEXT: movsd .LCPI1_0, %xmm0
+; CHECK-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: orps %xmm0, %xmm1
; CHECK-NEXT: subsd %xmm0, %xmm1
; CHECK-NEXT: xorps %xmm0, %xmm0
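The movsd/orpd/subsd sequence these checks expect is the usual magic-constant trick for unsigned-to-double conversion. The constant-pool value is not spelled out in the test, but assuming it is the double whose bit pattern is 0x4330000000000000 (that is, 2^52), the arithmetic is:

    or  : bits(2^52) | zext32(x), reinterpreted as double  =  2^52 + x   (exact, since x < 2^32 fits in the 52-bit mantissa)
    sub : (2^52 + x) - 2^52                                 =  x

so after the subsd the register holds x exactly, ready for the final conversion to float.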
diff --git a/test/CodeGen/X86/uint_to_fp.ll b/test/CodeGen/X86/uint_to_fp.ll
index 0536eb05222c..a2784fdcbbdd 100644
--- a/test/CodeGen/X86/uint_to_fp.ll
+++ b/test/CodeGen/X86/uint_to_fp.ll
@@ -1,14 +1,27 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | not grep "sub.*esp"
-; RUN: llc < %s -march=x86 -mcpu=yonah | grep cvtsi2ss
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin8 -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin8 -mattr=+sse2 | FileCheck %s --check-prefix=X64
; rdar://6034396
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-target triple = "i386-apple-darwin8"
-
-define void @test(i32 %x, float* %y) nounwind {
+define void @test(i32 %x, float* %y) nounwind {
+; X32-LABEL: test:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: shrl $23, %ecx
+; X32-NEXT: cvtsi2ssl %ecx, %xmm0
+; X32-NEXT: movss %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: shrl $23, %edi
+; X64-NEXT: cvtsi2ssl %edi, %xmm0
+; X64-NEXT: movss %xmm0, (%rsi)
+; X64-NEXT: retq
entry:
- lshr i32 %x, 23 ; <i32>:0 [#uses=1]
- uitofp i32 %0 to float ; <float>:1 [#uses=1]
- store float %1, float* %y
- ret void
+ lshr i32 %x, 23
+ uitofp i32 %0 to float
+ store float %1, float* %y
+ ret void
}
diff --git a/test/CodeGen/X86/umul-with-overflow.ll b/test/CodeGen/X86/umul-with-overflow.ll
index ba5a790f4380..29cecbe5a0f6 100644
--- a/test/CodeGen/X86/umul-with-overflow.ll
+++ b/test/CodeGen/X86/umul-with-overflow.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s
declare {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
define zeroext i1 @a(i32 %x) nounwind {
@@ -9,7 +9,6 @@ define zeroext i1 @a(i32 %x) nounwind {
; CHECK-LABEL: a:
; CHECK: mull
; CHECK: seto %al
-; CHECK: movzbl %al, %eax
; CHECK: ret
}
diff --git a/test/CodeGen/X86/unaligned-load.ll b/test/CodeGen/X86/unaligned-load.ll
index ffbbcff2e5d6..644a36447302 100644
--- a/test/CodeGen/X86/unaligned-load.ll
+++ b/test/CodeGen/X86/unaligned-load.ll
@@ -30,8 +30,8 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32,
; COREI7: movups _.str3
; CORE2: .section
-; CORE2: .align 3
+; CORE2: .p2align 3
; CORE2-NEXT: _.str1:
; CORE2-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING"
-; CORE2: .align 3
+; CORE2: .p2align 3
; CORE2-NEXT: _.str3:
diff --git a/test/CodeGen/X86/unaligned-spill-folding.ll b/test/CodeGen/X86/unaligned-spill-folding.ll
index dee94bce15a5..935c0b967f9e 100644
--- a/test/CodeGen/X86/unaligned-spill-folding.ll
+++ b/test/CodeGen/X86/unaligned-spill-folding.ll
@@ -34,7 +34,7 @@ middle.block:
; doesn't force stack realignment though
; UNALIGNED-LABEL: @test1
; UNALIGNED-NOT: andl $-{{..}}, %esp
-; UNALIGNED: movdqu {{.*}} # 16-byte Folded Spill
+; UNALIGNED: movdqu {{.*}} # 16-byte Spill
; UNALIGNED-NOT: paddd {{.*}} # 16-byte Folded Reload
; ALIGNED-LABEL: @test1
diff --git a/test/CodeGen/X86/unknown-location.ll b/test/CodeGen/X86/unknown-location.ll
index c41e529aa954..1058994d0ee1 100644
--- a/test/CodeGen/X86/unknown-location.ll
+++ b/test/CodeGen/X86/unknown-location.ll
@@ -22,15 +22,14 @@ entry:
!llvm.module.flags = !{!12}
!0 = !DILocalVariable(name: "x", line: 1, arg: 2, scope: !1, file: !2, type: !6)
-!1 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, scopeLine: 1, file: !10, scope: !2, type: !4)
+!1 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, scopeLine: 1, file: !10, scope: !2, type: !4)
!2 = !DIFile(filename: "test.c", directory: "/dir")
-!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "producer", isOptimized: false, emissionKind: 0, file: !10, enums: !11, retainedTypes: !11, subprograms: !9)
+!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "producer", isOptimized: false, emissionKind: FullDebug, file: !10, enums: !11, retainedTypes: !11)
!4 = !DISubroutineType(types: !5)
!5 = !{!6}
!6 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!7 = distinct !DILexicalBlock(line: 1, column: 30, file: !10, scope: !1)
!8 = !DILocation(line: 4, column: 3, scope: !7)
-!9 = !{!1}
!10 = !DIFile(filename: "test.c", directory: "/dir")
!11 = !{}
!12 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/unreachableblockelim.ll b/test/CodeGen/X86/unreachableblockelim.ll
new file mode 100644
index 000000000000..49a075c32811
--- /dev/null
+++ b/test/CodeGen/X86/unreachableblockelim.ll
@@ -0,0 +1,21 @@
+; RUN: opt -S < %s -unreachableblockelim | FileCheck %s
+; RUN: opt -S < %s -passes=unreachableblockelim | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @abort()
+
+; CHECK-LABEL: @foo(
+; CHECK-NOT: return:
+define void @foo(i32* %p) {
+entry:
+ %p.addr = alloca i32*, align 8
+ call void @abort()
+ unreachable
+
+return: ; No predecessors!
+ store i32* %p, i32** %p.addr, align 8
+ ret void
+}
+
diff --git a/test/CodeGen/X86/unused_stackslots.ll b/test/CodeGen/X86/unused_stackslots.ll
new file mode 100644
index 000000000000..0bb904130f1c
--- /dev/null
+++ b/test/CodeGen/X86/unused_stackslots.ll
@@ -0,0 +1,246 @@
+; PR26374: Check no stack slots are allocated for vregs which have no real reference.
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.ImageParameters = type { i32, i32, [0 x [16 x i16]] }
+%struct.InputParameters = type { i32, i32 }
+
+@c = common global %struct.ImageParameters* null, align 8
+@a = common global i16** null, align 8
+@d = common global [6 x i32] zeroinitializer, align 16
+@b = common global %struct.InputParameters* null, align 8
+@e = common global [4 x i32] zeroinitializer, align 16
+
+; It is not easy to check that there are no unused holes in the stack space allocated for spills,
+; so simply check that the allocated stack size cannot exceed 350 bytes.
+; (408 bytes were used before the fix for PR26374; 344 are used after the fix.)
+;
+; CHECK-LABEL: @fn
+; CHECK: subq {{\$3[0-4][0-9]}}, %rsp
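; As a reading of the bound above (illustration only, not part of the test): the
; FileCheck pattern {{\$3[0-4][0-9]}} only accepts a stack adjustment between
; $300 and $349, so
;   subq $344, %rsp   matches (the size after the PR26374 fix)
;   subq $408, %rsp   does not match (the size before the fix)
; and any regression of the spill-slot layout back above ~350 bytes fails.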
+
+; Function Attrs: nounwind uwtable
+define i32 @fn() #0 {
+entry:
+ %n = alloca [8 x [8 x i32]], align 16
+ %tmp = bitcast [8 x [8 x i32]]* %n to i8*
+ call void @llvm.lifetime.start(i64 256, i8* %tmp) #3
+ %tmp1 = bitcast [8 x [8 x i32]]* %n to i8*
+ %arraydecay.1 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 1, i64 0
+ %tmp2 = bitcast i32* %arraydecay.1 to i8*
+ %arraydecay.2 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 2, i64 0
+ %tmp3 = bitcast i32* %arraydecay.2 to i8*
+ %arraydecay.3 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 3, i64 0
+ %tmp4 = bitcast i32* %arraydecay.3 to i8*
+ %arraydecay.4 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 4, i64 0
+ %tmp5 = bitcast i32* %arraydecay.4 to i8*
+ %arraydecay.5 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 5, i64 0
+ %tmp6 = bitcast i32* %arraydecay.5 to i8*
+ %arraydecay.6 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 6, i64 0
+ %tmp7 = bitcast i32* %arraydecay.6 to i8*
+ %arraydecay.7 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 7, i64 0
+ %tmp8 = bitcast i32* %arraydecay.7 to i8*
+ br label %for.body
+
+for.body: ; preds = %for.inc73, %entry
+ %q.0131 = phi i32 [ 0, %entry ], [ %inc74, %for.inc73 ]
+ %m.0130 = phi i32 [ 0, %entry ], [ %m.4, %for.inc73 ]
+ %div = sdiv i32 %q.0131, 2
+ %shl = shl i32 %div, 3
+ %rem = srem i32 %q.0131, 2
+ %shl1 = shl nsw i32 %rem, 3
+ %tmp9 = sext i32 %shl1 to i64
+ %tmp10 = sext i32 %shl to i64
+ %tmp11 = or i32 %shl1, 4
+ %tmp12 = sext i32 %tmp11 to i64
+ %tmp13 = or i32 %shl, 4
+ %tmp14 = sext i32 %tmp13 to i64
+ br label %for.body4
+
+for.body4: ; preds = %for.inc48, %for.body
+ %indvars.iv148 = phi i64 [ %tmp10, %for.body ], [ %indvars.iv.next149, %for.inc48 ]
+ %m.1126 = phi i32 [ %m.0130, %for.body ], [ %m.3.lcssa, %for.inc48 ]
+ %tmp15 = load %struct.ImageParameters*, %struct.ImageParameters** @c, align 8
+ %opix_y = getelementptr inbounds %struct.ImageParameters, %struct.ImageParameters* %tmp15, i64 0, i32 1
+ %tmp16 = load i32, i32* %opix_y, align 4
+ %tmp17 = trunc i64 %indvars.iv148 to i32
+ %add5 = add nsw i32 %tmp16, %tmp17
+ %tmp18 = sub nuw nsw i64 %indvars.iv148, %tmp10
+ %tmp19 = sext i32 %add5 to i64
+ %tmp20 = add nsw i64 %tmp19, 1
+ %tmp21 = or i64 %indvars.iv148, 1
+ %tmp22 = or i64 %tmp18, 1
+ %tmp23 = add nsw i64 %tmp19, 2
+ %tmp24 = or i64 %indvars.iv148, 2
+ %tmp25 = or i64 %tmp18, 2
+ %tmp26 = add nsw i64 %tmp19, 3
+ %tmp27 = or i64 %indvars.iv148, 3
+ %tmp28 = or i64 %tmp18, 3
+ br label %for.body9
+
+for.body9: ; preds = %for.inc45.for.body9_crit_edge, %for.body4
+ %tmp29 = phi %struct.ImageParameters* [ %tmp15, %for.body4 ], [ %.pre, %for.inc45.for.body9_crit_edge ]
+ %indvars.iv145 = phi i64 [ %tmp9, %for.body4 ], [ %indvars.iv.next146, %for.inc45.for.body9_crit_edge ]
+ %m.2124 = phi i32 [ %m.1126, %for.body4 ], [ %m.3, %for.inc45.for.body9_crit_edge ]
+ %opix_x = getelementptr inbounds %struct.ImageParameters, %struct.ImageParameters* %tmp29, i64 0, i32 0
+ %tmp30 = load i32, i32* %opix_x, align 4
+ %tmp31 = trunc i64 %indvars.iv145 to i32
+ %add10 = add nsw i32 %tmp30, %tmp31
+ tail call void @LumaPrediction4x4(i32 %tmp31, i32 %tmp17, i32 0, i32 0, i32 0, i16 signext 0, i16 signext 0) #3
+ %tmp32 = load i16**, i16*** @a, align 8
+ %tmp33 = load %struct.ImageParameters*, %struct.ImageParameters** @c, align 8
+ %tmp34 = sub nuw nsw i64 %indvars.iv145, %tmp9
+ %tmp35 = sext i32 %add10 to i64
+ br label %for.cond14.preheader
+
+for.cond14.preheader: ; preds = %for.body9
+ %arrayidx = getelementptr inbounds i16*, i16** %tmp32, i64 %tmp19
+ %tmp36 = load i16*, i16** %arrayidx, align 8
+ %arrayidx20 = getelementptr inbounds i16, i16* %tmp36, i64 %tmp35
+ %arrayidx26 = getelementptr inbounds %struct.ImageParameters, %struct.ImageParameters* %tmp33, i64 0, i32 2, i64 %indvars.iv148, i64 %indvars.iv145
+ %arrayidx35 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 %tmp18, i64 %tmp34
+ %tmp37 = bitcast i16* %arrayidx20 to <4 x i16>*
+ %tmp38 = load <4 x i16>, <4 x i16>* %tmp37, align 2
+ %tmp39 = zext <4 x i16> %tmp38 to <4 x i32>
+ %tmp40 = bitcast i16* %arrayidx26 to <4 x i16>*
+ %tmp41 = load <4 x i16>, <4 x i16>* %tmp40, align 2
+ %tmp42 = zext <4 x i16> %tmp41 to <4 x i32>
+ %tmp43 = sub nsw <4 x i32> %tmp39, %tmp42
+ %tmp44 = bitcast i32* %arrayidx35 to <4 x i32>*
+ store <4 x i32> %tmp43, <4 x i32>* %tmp44, align 16
+ store <4 x i32> %tmp43, <4 x i32>* bitcast ([6 x i32]* @d to <4 x i32>*), align 16
+ %arrayidx.1 = getelementptr inbounds i16*, i16** %tmp32, i64 %tmp20
+ %tmp45 = load i16*, i16** %arrayidx.1, align 8
+ %arrayidx20.1 = getelementptr inbounds i16, i16* %tmp45, i64 %tmp35
+ %arrayidx26.1 = getelementptr inbounds %struct.ImageParameters, %struct.ImageParameters* %tmp33, i64 0, i32 2, i64 %tmp21, i64 %indvars.iv145
+ %arrayidx35.1 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 %tmp22, i64 %tmp34
+ %tmp46 = bitcast i16* %arrayidx20.1 to <4 x i16>*
+ %tmp47 = load <4 x i16>, <4 x i16>* %tmp46, align 2
+ %tmp48 = zext <4 x i16> %tmp47 to <4 x i32>
+ %tmp49 = bitcast i16* %arrayidx26.1 to <4 x i16>*
+ %tmp50 = load <4 x i16>, <4 x i16>* %tmp49, align 2
+ %tmp51 = zext <4 x i16> %tmp50 to <4 x i32>
+ %tmp52 = sub nsw <4 x i32> %tmp48, %tmp51
+ %tmp53 = bitcast i32* %arrayidx35.1 to <4 x i32>*
+ store <4 x i32> %tmp52, <4 x i32>* %tmp53, align 16
+ store <4 x i32> %tmp52, <4 x i32>* bitcast (i32* getelementptr inbounds ([6 x i32], [6 x i32]* @d, i64 0, i64 4) to <4 x i32>*), align 16
+ %arrayidx.2 = getelementptr inbounds i16*, i16** %tmp32, i64 %tmp23
+ %tmp54 = load i16*, i16** %arrayidx.2, align 8
+ %arrayidx20.2 = getelementptr inbounds i16, i16* %tmp54, i64 %tmp35
+ %arrayidx26.2 = getelementptr inbounds %struct.ImageParameters, %struct.ImageParameters* %tmp33, i64 0, i32 2, i64 %tmp24, i64 %indvars.iv145
+ %arrayidx35.2 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 %tmp25, i64 %tmp34
+ %tmp55 = bitcast i16* %arrayidx20.2 to <4 x i16>*
+ %tmp56 = load <4 x i16>, <4 x i16>* %tmp55, align 2
+ %tmp57 = zext <4 x i16> %tmp56 to <4 x i32>
+ %tmp58 = bitcast i16* %arrayidx26.2 to <4 x i16>*
+ %tmp59 = load <4 x i16>, <4 x i16>* %tmp58, align 2
+ %tmp60 = zext <4 x i16> %tmp59 to <4 x i32>
+ %tmp61 = sub nsw <4 x i32> %tmp57, %tmp60
+ %tmp62 = bitcast i32* %arrayidx35.2 to <4 x i32>*
+ store <4 x i32> %tmp61, <4 x i32>* %tmp62, align 16
+ store <4 x i32> %tmp61, <4 x i32>* bitcast (i32* getelementptr ([6 x i32], [6 x i32]* @d, i64 1, i64 2) to <4 x i32>*), align 16
+ %arrayidx.3 = getelementptr inbounds i16*, i16** %tmp32, i64 %tmp26
+ %tmp63 = load i16*, i16** %arrayidx.3, align 8
+ %arrayidx20.3 = getelementptr inbounds i16, i16* %tmp63, i64 %tmp35
+ %arrayidx26.3 = getelementptr inbounds %struct.ImageParameters, %struct.ImageParameters* %tmp33, i64 0, i32 2, i64 %tmp27, i64 %indvars.iv145
+ %arrayidx35.3 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 %tmp28, i64 %tmp34
+ %tmp64 = bitcast i16* %arrayidx20.3 to <4 x i16>*
+ %tmp65 = load <4 x i16>, <4 x i16>* %tmp64, align 2
+ %tmp66 = zext <4 x i16> %tmp65 to <4 x i32>
+ %tmp67 = bitcast i16* %arrayidx26.3 to <4 x i16>*
+ %tmp68 = load <4 x i16>, <4 x i16>* %tmp67, align 2
+ %tmp69 = zext <4 x i16> %tmp68 to <4 x i32>
+ %tmp70 = sub nsw <4 x i32> %tmp66, %tmp69
+ %tmp71 = bitcast i32* %arrayidx35.3 to <4 x i32>*
+ store <4 x i32> %tmp70, <4 x i32>* %tmp71, align 16
+ store <4 x i32> %tmp70, <4 x i32>* bitcast (i32* getelementptr ([6 x i32], [6 x i32]* @d, i64 2, i64 0) to <4 x i32>*), align 16
+ %tmp72 = load %struct.InputParameters*, %struct.InputParameters** @b, align 8
+ %rdopt = getelementptr inbounds %struct.InputParameters, %struct.InputParameters* %tmp72, i64 0, i32 0
+ %tmp73 = load i32, i32* %rdopt, align 4
+ %cmp42 = icmp eq i32 %tmp73, 0
+ br i1 %cmp42, label %land.lhs.true, label %if.then
+
+land.lhs.true: ; preds = %for.cond14.preheader
+ %Transform8x8Mode = getelementptr inbounds %struct.InputParameters, %struct.InputParameters* %tmp72, i64 0, i32 1
+ %tmp74 = load i32, i32* %Transform8x8Mode, align 4
+ %tobool = icmp eq i32 %tmp74, 0
+ br i1 %tobool, label %if.then, label %for.inc45
+
+if.then: ; preds = %land.lhs.true, %for.cond14.preheader
+ %call = tail call i32 @distortion4x4(i32* nonnull getelementptr inbounds ([6 x i32], [6 x i32]* @d, i64 0, i64 0)) #3
+ %add44 = add nsw i32 %call, %m.2124
+ br label %for.inc45
+
+for.inc45: ; preds = %if.then, %land.lhs.true
+ %m.3 = phi i32 [ %m.2124, %land.lhs.true ], [ %add44, %if.then ]
+ %cmp8 = icmp slt i64 %indvars.iv145, %tmp12
+ br i1 %cmp8, label %for.inc45.for.body9_crit_edge, label %for.inc48
+
+for.inc45.for.body9_crit_edge: ; preds = %for.inc45
+ %indvars.iv.next146 = add nsw i64 %indvars.iv145, 4
+ %.pre = load %struct.ImageParameters*, %struct.ImageParameters** @c, align 8
+ br label %for.body9
+
+for.inc48: ; preds = %for.inc45
+ %m.3.lcssa = phi i32 [ %m.3, %for.inc45 ]
+ %indvars.iv.next149 = add nsw i64 %indvars.iv148, 4
+ %cmp3 = icmp slt i64 %indvars.iv148, %tmp14
+ br i1 %cmp3, label %for.body4, label %for.end50
+
+for.end50: ; preds = %for.inc48
+ %m.3.lcssa.lcssa = phi i32 [ %m.3.lcssa, %for.inc48 ]
+ %tmp75 = load %struct.InputParameters*, %struct.InputParameters** @b, align 8
+ %rdopt51 = getelementptr inbounds %struct.InputParameters, %struct.InputParameters* %tmp75, i64 0, i32 0
+ %tmp76 = load i32, i32* %rdopt51, align 4
+ %cmp52 = icmp eq i32 %tmp76, 0
+ br i1 %cmp52, label %land.lhs.true54, label %for.inc73
+
+land.lhs.true54: ; preds = %for.end50
+ %Transform8x8Mode55 = getelementptr inbounds %struct.InputParameters, %struct.InputParameters* %tmp75, i64 0, i32 1
+ %tmp77 = load i32, i32* %Transform8x8Mode55, align 4
+ %tobool56 = icmp eq i32 %tmp77, 0
+ br i1 %tobool56, label %for.inc73, label %for.body61.preheader
+
+for.body61.preheader: ; preds = %land.lhs.true54
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull bitcast (i32* getelementptr ([4 x i32], [4 x i32]* @e, i64 4, i64 0) to i8*), i8* %tmp1, i64 32, i32 16, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull bitcast (i32* getelementptr ([4 x i32], [4 x i32]* @e, i64 6, i64 0) to i8*), i8* %tmp2, i64 32, i32 16, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull bitcast (i32* getelementptr ([4 x i32], [4 x i32]* @e, i64 8, i64 0) to i8*), i8* %tmp3, i64 32, i32 16, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull bitcast (i32* getelementptr ([4 x i32], [4 x i32]* @e, i64 10, i64 0) to i8*), i8* %tmp4, i64 32, i32 16, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull bitcast (i32* getelementptr ([4 x i32], [4 x i32]* @e, i64 12, i64 0) to i8*), i8* %tmp5, i64 32, i32 16, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull bitcast (i32* getelementptr ([4 x i32], [4 x i32]* @e, i64 14, i64 0) to i8*), i8* %tmp6, i64 32, i32 16, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull bitcast (i32* getelementptr ([4 x i32], [4 x i32]* @e, i64 16, i64 0) to i8*), i8* %tmp7, i64 32, i32 16, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull bitcast (i32* getelementptr ([4 x i32], [4 x i32]* @e, i64 18, i64 0) to i8*), i8* %tmp8, i64 32, i32 16, i1 false)
+ %call70 = tail call i32 @distortion4x4(i32* nonnull getelementptr inbounds ([4 x i32], [4 x i32]* @e, i64 0, i64 0)) #3
+ %add71 = add nsw i32 %call70, %m.3.lcssa.lcssa
+ br label %for.inc73
+
+for.inc73: ; preds = %for.body61.preheader, %land.lhs.true54, %for.end50
+ %m.4 = phi i32 [ %add71, %for.body61.preheader ], [ %m.3.lcssa.lcssa, %land.lhs.true54 ], [ %m.3.lcssa.lcssa, %for.end50 ]
+ %inc74 = add nuw nsw i32 %q.0131, 1
+ %exitcond156 = icmp eq i32 %inc74, 4
+ br i1 %exitcond156, label %for.end75, label %for.body
+
+for.end75: ; preds = %for.inc73
+ %m.4.lcssa = phi i32 [ %m.4, %for.inc73 ]
+ call void @llvm.lifetime.end(i64 256, i8* %tmp) #3
+ ret i32 %m.4.lcssa
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+
+declare void @LumaPrediction4x4(i32, i32, i32, i32, i32, i16 signext, i16 signext) #2
+
+declare i32 @distortion4x4(i32*) #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+
diff --git a/test/CodeGen/X86/update-terminator.mir b/test/CodeGen/X86/update-terminator.mir
new file mode 100644
index 000000000000..1e75c6af9eb9
--- /dev/null
+++ b/test/CodeGen/X86/update-terminator.mir
@@ -0,0 +1,57 @@
+# RUN: llc -march=x86-64 -verify-machineinstrs -run-pass block-placement -o - %s | FileCheck %s
+# Check that the conditional jump in bb.1 is changed to an unconditional jump after block placement swaps bb.2 and bb.3.
+
+--- |
+ @a = external global i16
+ @b = external global i32
+
+ ; Function Attrs: nounwind
+ define void @f2() {
+ br i1 undef, label %bb1, label %bb3
+
+ bb1:
+ br i1 undef, label %bb2, label %bb2
+
+ bb2:
+ br label %bb4
+
+ bb3:
+ br label %bb2
+
+ bb4:
+ ret void
+ }
+
+
+...
+---
+# CHECK-LABEL: name: f2
+# CHECK: bb.1:
+# CHECK: JMP_1 %bb.2
+# CHECK: bb.3:
+# CHECK: bb.2:
+name: f2
+body: |
+ bb.0 (%ir-block.0):
+ successors: %bb.1(50), %bb.3(50)
+
+ JNE_1 %bb.1, implicit %eflags
+ JMP_1 %bb.3
+ bb.1:
+ successors: %bb.2(100)
+
+ JNE_1 %bb.2, implicit %eflags
+
+ bb.2:
+ successors: %bb.4(100)
+
+ JMP_1 %bb.4
+
+ bb.3:
+ successors: %bb.2(100)
+ JMP_1 %bb.2
+
+ bb.4:
+ RETQ
+
+...
diff --git a/test/CodeGen/X86/urem-i8-constant.ll b/test/CodeGen/X86/urem-i8-constant.ll
index e3cb69ca591f..45717f985c23 100644
--- a/test/CodeGen/X86/urem-i8-constant.ll
+++ b/test/CodeGen/X86/urem-i8-constant.ll
@@ -1,6 +1,21 @@
-; RUN: llc < %s -march=x86 | grep 111
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-unknown | FileCheck %s
define i8 @foo(i8 %tmp325) {
- %t546 = urem i8 %tmp325, 37
- ret i8 %t546
+; CHECK-LABEL: foo:
+; CHECK: # BB#0:
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: imull $111, %ecx, %eax
+; CHECK-NEXT: andl $28672, %eax # imm = 0x7000
+; CHECK-NEXT: shrl $12, %eax
+; CHECK-NEXT: movb $37, %dl
+; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: mulb %dl
+; CHECK-NEXT: subb %al, %cl
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retl
+;
+ %t546 = urem i8 %tmp325, 37
+ ret i8 %t546
}
+
diff --git a/test/CodeGen/X86/urem-power-of-two.ll b/test/CodeGen/X86/urem-power-of-two.ll
new file mode 100644
index 000000000000..9e27809c297d
--- /dev/null
+++ b/test/CodeGen/X86/urem-power-of-two.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; The easy case: a constant power-of-2 divisor.
+
+define i64 @const_pow_2(i64 %x) {
+; CHECK-LABEL: const_pow_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: andl $31, %edi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+;
+ %urem = urem i64 %x, 32
+ ret i64 %urem
+}
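; A constant power-of-two unsigned remainder is expected to lower to a mask,
; since x urem (1 << k) == x and ((1 << k) - 1). As a worked example for the
; divisor 32 used above (illustration only):
;   100 urem 32 = 4  and  100 & 31 = 0b1100100 & 0b0011111 = 0b0000100 = 4
; hence the CHECK lines expect 'andl $31' instead of a divide.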
+
+; A left-shifted power-of-2 divisor. Use a weird type for wider coverage.
+
+define i25 @shift_left_pow_2(i25 %x, i25 %y) {
+; CHECK-LABEL: shift_left_pow_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: shll %cl, %eax
+; CHECK-NEXT: addl $33554431, %eax # imm = 0x1FFFFFF
+; CHECK-NEXT: andl %edi, %eax
+; CHECK-NEXT: retq
+;
+ %shl = shl i25 1, %y
+ %urem = urem i25 %x, %shl
+ ret i25 %urem
+}
+
+; FIXME: A logically right-shifted sign bit is a power-of-2 or UB.
+
+define i16 @shift_right_pow_2(i16 %x, i16 %y) {
+; CHECK-LABEL: shift_right_pow_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $32768, %r8d # imm = 0x8000
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: shrl %cl, %r8d
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: divw %r8w
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
+;
+ %shr = lshr i16 -32768, %y
+ %urem = urem i16 %x, %shr
+ ret i16 %urem
+}
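; Spelling out the FIXME above (illustration only): lshr i16 -32768, %y keeps
; exactly one bit set (0x8000 >> %y) for any defined shift amount, so the
; remainder could in principle be formed with a mask rather than divw, e.g.
;   %mask = add i16 %shr, -1
;   %urem = and i16 %x, %mask
; but the lowering the CHECK lines currently expect still uses a hardware divide.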
+
+; FIXME: A zero divisor would be UB, so this could be reduced to an 'and' with 3.
+
+define i8 @and_pow_2(i8 %x, i8 %y) {
+; CHECK-LABEL: and_pow_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: andb $4, %sil
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: divb %sil
+; CHECK-NEXT: movzbl %ah, %eax # NOREX
+; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: retq
+;
+ %and = and i8 %y, 4
+ %urem = urem i8 %x, %and
+ ret i8 %urem
+}
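; Expanding on the FIXME above (illustration only): %and = %y & 4 is either 0
; or 4, and a zero divisor makes the urem undefined, so the optimizer may
; assume a divide by 4; under that assumption the body could fold to
;   %urem = and i8 %x, 3
; instead of the divb sequence the CHECK lines currently expect.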
+
+; A vector splat constant divisor should get the same treatment as a scalar.
+
+define <4 x i32> @vec_const_pow_2(<4 x i32> %x) {
+; CHECK-LABEL: vec_const_pow_2:
+; CHECK: # BB#0:
+; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
+;
+ %urem = urem <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
+ ret <4 x i32> %urem
+}
+
diff --git a/test/CodeGen/X86/utf16-cfstrings.ll b/test/CodeGen/X86/utf16-cfstrings.ll
index 5f0e78fccc65..773efbcdefaa 100644
--- a/test/CodeGen/X86/utf16-cfstrings.ll
+++ b/test/CodeGen/X86/utf16-cfstrings.ll
@@ -9,7 +9,7 @@
@_unnamed_cfstring_ = private constant %struct.NSConstantString { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([5 x i16]* @.str to i8*), i64 4 }, section "__DATA,__cfstring"
; CHECK: .section __TEXT,__ustring
-; CHECK-NEXT: .align 1
+; CHECK-NEXT: .p2align 1
; CHECK-NEXT: _.str:
; CHECK-NEXT: .short 252 ## 0xfc
; CHECK-NEXT: .short 98 ## 0x62
diff --git a/test/CodeGen/X86/v4f32-immediate.ll b/test/CodeGen/X86/v4f32-immediate.ll
index 68d20a04ecf0..7945b1093f8e 100644
--- a/test/CodeGen/X86/v4f32-immediate.ll
+++ b/test/CodeGen/X86/v4f32-immediate.ll
@@ -1,7 +1,16 @@
-; RUN: llc < %s -march=x86 -mattr=+sse | FileCheck %s
-
-; CHECK: movaps
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse | FileCheck %s --check-prefix=X64
define <4 x float> @foo() {
+; X32-LABEL: foo:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [3.223542e+00,2.300000e+00,1.200000e+00,1.000000e-01]
+; X32-NEXT: retl
+;
+; X64-LABEL: foo:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [3.223542e+00,2.300000e+00,1.200000e+00,1.000000e-01]
+; X64-NEXT: retq
ret <4 x float> <float 0x4009C9D0A0000000, float 0x4002666660000000, float 0x3FF3333340000000, float 0x3FB99999A0000000>
}
diff --git a/test/CodeGen/X86/v8i1-masks.ll b/test/CodeGen/X86/v8i1-masks.ll
index 21fe96321987..0135832ad929 100644
--- a/test/CodeGen/X86/v8i1-masks.ll
+++ b/test/CodeGen/X86/v8i1-masks.ll
@@ -1,15 +1,36 @@
-; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
-
-;CHECK-LABEL: and_masks:
-;CHECK: vmovaps
-;CHECK: vcmpltp
-;CHECK: vcmpltp
-;CHECK: vandps
-;CHECK: vandps
-;CHECK: vmovaps
-;CHECK: ret
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=X64
define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
+; X32-LABEL: and_masks:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: vmovups (%edx), %ymm0
+; X32-NEXT: vmovups (%ecx), %ymm1
+; X32-NEXT: vcmpltps %ymm0, %ymm1, %ymm1
+; X32-NEXT: vmovups (%eax), %ymm2
+; X32-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
+; X32-NEXT: vandps LCPI0_0, %ymm1, %ymm1
+; X32-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X32-NEXT: vmovaps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: and_masks:
+; X64: ## BB#0:
+; X64-NEXT: vmovups (%rdi), %ymm0
+; X64-NEXT: vmovups (%rsi), %ymm1
+; X64-NEXT: vcmpltps %ymm0, %ymm1, %ymm1
+; X64-NEXT: vmovups (%rdx), %ymm2
+; X64-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
+; X64-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; X64-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X64-NEXT: vmovaps %ymm0, (%rax)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%v0 = load <8 x float>, <8 x float>* %a, align 16
%v1 = load <8 x float>, <8 x float>* %b, align 16
%m0 = fcmp olt <8 x float> %v1, %v0
@@ -21,13 +42,30 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
ret void
}
-;CHECK: neg_mask
-;CHECK: vcmpltps
-;CHECK: vxorps
-;CHECK: vandps
-;CHECK: vmovaps
-;CHECK: ret
define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
+; X32-LABEL: neg_masks:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovups (%ecx), %ymm0
+; X32-NEXT: vcmpltps (%eax), %ymm0, %ymm0
+; X32-NEXT: vmovaps {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; X32-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X32-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X32-NEXT: vmovaps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: neg_masks:
+; X64: ## BB#0:
+; X64-NEXT: vmovups (%rsi), %ymm0
+; X64-NEXT: vcmpltps (%rdi), %ymm0, %ymm0
+; X64-NEXT: vmovaps {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X64-NEXT: vandps %ymm1, %ymm0, %ymm0
+; X64-NEXT: vmovaps %ymm0, (%rax)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%v0 = load <8 x float>, <8 x float>* %a, align 16
%v1 = load <8 x float>, <8 x float>* %b, align 16
%m0 = fcmp olt <8 x float> %v1, %v0
diff --git a/test/CodeGen/X86/vararg-callee-cleanup.ll b/test/CodeGen/X86/vararg-callee-cleanup.ll
index bb1104d85d87..7d93c332f61c 100644
--- a/test/CodeGen/X86/vararg-callee-cleanup.ll
+++ b/test/CodeGen/X86/vararg-callee-cleanup.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i686-pc-windows < %s | FileCheck %s
+; RUN: llc -mtriple=i686-pc-windows -no-x86-call-frame-opt < %s | FileCheck %s
target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
diff --git a/test/CodeGen/X86/vec-sign.ll b/test/CodeGen/X86/vec-sign.ll
deleted file mode 100644
index b3d85fd6ec7b..000000000000
--- a/test/CodeGen/X86/vec-sign.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=nehalem | FileCheck %s
-
-define <4 x i32> @signd(<4 x i32> %a, <4 x i32> %b) nounwind {
-entry:
-; CHECK-LABEL: signd:
-; CHECK: psignd
-; CHECK-NOT: sub
-; CHECK: ret
- %b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
- %sub = sub nsw <4 x i32> zeroinitializer, %a
- %0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
- %1 = and <4 x i32> %a, %0
- %2 = and <4 x i32> %b.lobit, %sub
- %cond = or <4 x i32> %1, %2
- ret <4 x i32> %cond
-}
-
-define <4 x i32> @blendvb(<4 x i32> %b, <4 x i32> %a, <4 x i32> %c) nounwind {
-entry:
-; CHECK-LABEL: blendvb:
-; CHECK: pblendvb
-; CHECK: ret
- %b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
- %sub = sub nsw <4 x i32> zeroinitializer, %a
- %0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
- %1 = and <4 x i32> %c, %0
- %2 = and <4 x i32> %a, %b.lobit
- %cond = or <4 x i32> %1, %2
- ret <4 x i32> %cond
-}
diff --git a/test/CodeGen/X86/vec_compare-sse4.ll b/test/CodeGen/X86/vec_compare-sse4.ll
index 084d61134206..714701897918 100644
--- a/test/CodeGen/X86/vec_compare-sse4.ll
+++ b/test/CodeGen/X86/vec_compare-sse4.ll
@@ -1,35 +1,66 @@
-; RUN: llc < %s -march=x86 -mattr=-sse3,+sse2 | FileCheck %s -check-prefix=SSE2
-; RUN: llc < %s -march=x86 -mattr=-sse4.2,+sse4.1 | FileCheck %s -check-prefix=SSE41
-; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s -check-prefix=SSE42
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=-sse3,+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=-sse4.2,+sse4.1 | FileCheck %s --check-prefix=SSE41
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE42
define <2 x i64> @test1(<2 x i64> %A, <2 x i64> %B) nounwind {
-; SSE42-LABEL: test1:
-; SSE42: pcmpgtq
-; SSE42: ret
-; SSE41-LABEL: test1:
-; SSE41-NOT: pcmpgtq
-; SSE41: ret
; SSE2-LABEL: test1:
-; SSE2-NOT: pcmpgtq
-; SSE2: ret
-
- %C = icmp sgt <2 x i64> %A, %B
+; SSE2: ## BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retl
+;
+; SSE41-LABEL: test1:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm3, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; SSE42-LABEL: test1:
+; SSE42: ## BB#0:
+; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; SSE42-NEXT: retl
+ %C = icmp sgt <2 x i64> %A, %B
%D = sext <2 x i1> %C to <2 x i64>
- ret <2 x i64> %D
+ ret <2 x i64> %D
}
define <2 x i64> @test2(<2 x i64> %A, <2 x i64> %B) nounwind {
-; SSE42-LABEL: test2:
-; SSE42: pcmpeqq
-; SSE42: ret
-; SSE41-LABEL: test2:
-; SSE41: pcmpeqq
-; SSE41: ret
; SSE2-LABEL: test2:
-; SSE2-NOT: pcmpeqq
-; SSE2: ret
-
- %C = icmp eq <2 x i64> %A, %B
+; SSE2: ## BB#0:
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: retl
+;
+; SSE41-LABEL: test2:
+; SSE41: ## BB#0:
+; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
+; SSE41-NEXT: retl
+;
+; SSE42-LABEL: test2:
+; SSE42: ## BB#0:
+; SSE42-NEXT: pcmpeqq %xmm1, %xmm0
+; SSE42-NEXT: retl
+ %C = icmp eq <2 x i64> %A, %B
%D = sext <2 x i1> %C to <2 x i64>
- ret <2 x i64> %D
+ ret <2 x i64> %D
}
diff --git a/test/CodeGen/X86/vec_ctbits.ll b/test/CodeGen/X86/vec_ctbits.ll
index 66114bc9c6bc..e151317c6585 100644
--- a/test/CodeGen/X86/vec_ctbits.ll
+++ b/test/CodeGen/X86/vec_ctbits.ll
@@ -121,22 +121,22 @@ define <2 x i32> @prompop(<2 x i32> %a) nounwind {
; CHECK-LABEL: prompop:
; CHECK: # BB#0:
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlq $1, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
; CHECK-NEXT: psubq %xmm1, %xmm0
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: pand %xmm1, %xmm2
+; CHECK-NEXT: movdqa %xmm0, %xmm3
+; CHECK-NEXT: pand %xmm1, %xmm3
; CHECK-NEXT: psrlq $2, %xmm0
; CHECK-NEXT: pand %xmm1, %xmm0
-; CHECK-NEXT: paddq %xmm2, %xmm0
+; CHECK-NEXT: paddq %xmm3, %xmm0
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlq $4, %xmm1
; CHECK-NEXT: paddq %xmm0, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
-; CHECK-NEXT: pxor %xmm0, %xmm0
-; CHECK-NEXT: psadbw %xmm0, %xmm1
+; CHECK-NEXT: psadbw %xmm2, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%c = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
diff --git a/test/CodeGen/X86/vec_ext_inreg.ll b/test/CodeGen/X86/vec_ext_inreg.ll
index 02b16a79f4a0..1ee4b24b62f2 100644
--- a/test/CodeGen/X86/vec_ext_inreg.ll
+++ b/test/CodeGen/X86/vec_ext_inreg.ll
@@ -1,36 +1,108 @@
-; RUN: llc < %s -march=x86-64
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
define <8 x i32> @a(<8 x i32> %a) nounwind {
+; SSE-LABEL: a:
+; SSE: # BB#0:
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: a:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpslld $16, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: a:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpslld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
+; AVX2-NEXT: retq
%b = trunc <8 x i32> %a to <8 x i16>
%c = sext <8 x i16> %b to <8 x i32>
ret <8 x i32> %c
}
define <3 x i32> @b(<3 x i32> %a) nounwind {
+; SSE-LABEL: b:
+; SSE: # BB#0:
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: b:
+; AVX: # BB#0:
+; AVX-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX-NEXT: retq
%b = trunc <3 x i32> %a to <3 x i16>
%c = sext <3 x i16> %b to <3 x i32>
ret <3 x i32> %c
}
define <1 x i32> @c(<1 x i32> %a) nounwind {
+; ALL-LABEL: c:
+; ALL: # BB#0:
+; ALL-NEXT: movswl %di, %eax
+; ALL-NEXT: retq
%b = trunc <1 x i32> %a to <1 x i16>
%c = sext <1 x i16> %b to <1 x i32>
ret <1 x i32> %c
}
define <8 x i32> @d(<8 x i32> %a) nounwind {
+; SSE-LABEL: d:
+; SSE: # BB#0:
+; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; SSE-NEXT: andps %xmm2, %xmm0
+; SSE-NEXT: andps %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: d:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: d:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: retq
%b = trunc <8 x i32> %a to <8 x i16>
%c = zext <8 x i16> %b to <8 x i32>
ret <8 x i32> %c
}
define <3 x i32> @e(<3 x i32> %a) nounwind {
+; SSE-LABEL: e:
+; SSE: # BB#0:
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: e:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
+; AVX-NEXT: retq
%b = trunc <3 x i32> %a to <3 x i16>
%c = zext <3 x i16> %b to <3 x i32>
ret <3 x i32> %c
}
define <1 x i32> @f(<1 x i32> %a) nounwind {
+; ALL-LABEL: f:
+; ALL: # BB#0:
+; ALL-NEXT: movzwl %di, %eax
+; ALL-NEXT: retq
%b = trunc <1 x i32> %a to <1 x i16>
%c = zext <1 x i16> %b to <1 x i32>
ret <1 x i32> %c
diff --git a/test/CodeGen/X86/vec_extract-avx.ll b/test/CodeGen/X86/vec_extract-avx.ll
index abb07233d35e..7286b4c403b9 100644
--- a/test/CodeGen/X86/vec_extract-avx.ll
+++ b/test/CodeGen/X86/vec_extract-avx.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=X64
; When extracting multiple consecutive elements from a larger
; vector into a smaller one, do it efficiently. We should use
@@ -8,11 +9,18 @@
; Extracting the low elements only requires using the right kind of store.
define void @low_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
-; CHECK-LABEL: low_v8f32_to_v4f32:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps %xmm0, (%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X32-LABEL: low_v8f32_to_v4f32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps %xmm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: low_v8f32_to_v4f32:
+; X64: # BB#0:
+; X64-NEXT: vmovaps %xmm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%ext0 = extractelement <8 x float> %v, i32 0
%ext1 = extractelement <8 x float> %v, i32 1
%ext2 = extractelement <8 x float> %v, i32 2
@@ -27,11 +35,18 @@ define void @low_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
; Extracting the high elements requires just one AVX instruction.
define void @high_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
-; CHECK-LABEL: high_v8f32_to_v4f32:
-; CHECK: # BB#0:
-; CHECK-NEXT: vextractf128 $1, %ymm0, (%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X32-LABEL: high_v8f32_to_v4f32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vextractf128 $1, %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: high_v8f32_to_v4f32:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%ext0 = extractelement <8 x float> %v, i32 4
%ext1 = extractelement <8 x float> %v, i32 5
%ext2 = extractelement <8 x float> %v, i32 6
@@ -48,11 +63,18 @@ define void @high_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
; if we were actually using the vector in this function and
; have AVX2, we should generate vextracti128 (the int version).
define void @high_v8i32_to_v4i32(<8 x i32> %v, <4 x i32>* %ptr) {
-; CHECK-LABEL: high_v8i32_to_v4i32:
-; CHECK: # BB#0:
-; CHECK-NEXT: vextractf128 $1, %ymm0, (%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X32-LABEL: high_v8i32_to_v4i32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vextractf128 $1, %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: high_v8i32_to_v4i32:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%ext0 = extractelement <8 x i32> %v, i32 4
%ext1 = extractelement <8 x i32> %v, i32 5
%ext2 = extractelement <8 x i32> %v, i32 6
@@ -67,11 +89,18 @@ define void @high_v8i32_to_v4i32(<8 x i32> %v, <4 x i32>* %ptr) {
; Make sure that element size doesn't alter the codegen.
define void @high_v4f64_to_v2f64(<4 x double> %v, <2 x double>* %ptr) {
-; CHECK-LABEL: high_v4f64_to_v2f64:
-; CHECK: # BB#0:
-; CHECK-NEXT: vextractf128 $1, %ymm0, (%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X32-LABEL: high_v4f64_to_v2f64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vextractf128 $1, %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: high_v4f64_to_v2f64:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%ext0 = extractelement <4 x double> %v, i32 2
%ext1 = extractelement <4 x double> %v, i32 3
%ins0 = insertelement <2 x double> undef, double %ext0, i32 0
@@ -84,14 +113,25 @@ define void @high_v4f64_to_v2f64(<4 x double> %v, <2 x double>* %ptr) {
; FIXME - Ideally these should just call VMOVD/VMOVQ/VMOVSS/VMOVSD
define void @legal_vzmovl_2i32_8i32(<2 x i32>* %in, <8 x i32>* %out) {
-; CHECK-LABEL: legal_vzmovl_2i32_8i32:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; CHECK-NEXT: vmovaps %ymm0, (%rsi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X32-LABEL: legal_vzmovl_2i32_8i32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; X32-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; X32-NEXT: vmovaps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: legal_vzmovl_2i32_8i32:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; X64-NEXT: vmovaps %ymm0, (%rsi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%ld = load <2 x i32>, <2 x i32>* %in, align 8
%ext = extractelement <2 x i32> %ld, i64 0
%ins = insertelement <8 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 %ext, i64 0
@@ -100,14 +140,25 @@ define void @legal_vzmovl_2i32_8i32(<2 x i32>* %in, <8 x i32>* %out) {
}
define void @legal_vzmovl_2i64_4i64(<2 x i64>* %in, <4 x i64>* %out) {
-; CHECK-LABEL: legal_vzmovl_2i64_4i64:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovupd (%rdi), %xmm0
-; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
-; CHECK-NEXT: vmovapd %ymm0, (%rsi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X32-LABEL: legal_vzmovl_2i64_4i64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovupd (%ecx), %xmm0
+; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; X32-NEXT: vmovapd %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: legal_vzmovl_2i64_4i64:
+; X64: # BB#0:
+; X64-NEXT: vmovupd (%rdi), %xmm0
+; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; X64-NEXT: vmovapd %ymm0, (%rsi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%ld = load <2 x i64>, <2 x i64>* %in, align 8
%ext = extractelement <2 x i64> %ld, i64 0
%ins = insertelement <4 x i64> <i64 undef, i64 0, i64 0, i64 0>, i64 %ext, i64 0
@@ -116,14 +167,23 @@ define void @legal_vzmovl_2i64_4i64(<2 x i64>* %in, <4 x i64>* %out) {
}
define void @legal_vzmovl_2f32_8f32(<2 x float>* %in, <8 x float>* %out) {
-; CHECK-LABEL: legal_vzmovl_2f32_8f32:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; CHECK-NEXT: vmovaps %ymm0, (%rsi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X32-LABEL: legal_vzmovl_2f32_8f32:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vmovaps %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: legal_vzmovl_2f32_8f32:
+; X64: # BB#0:
+; X64-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; X64-NEXT: vmovaps %ymm0, (%rsi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%ld = load <2 x float>, <2 x float>* %in, align 8
%ext = extractelement <2 x float> %ld, i64 0
%ins = insertelement <8 x float> <float undef, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, float %ext, i64 0
@@ -132,14 +192,25 @@ define void @legal_vzmovl_2f32_8f32(<2 x float>* %in, <8 x float>* %out) {
}
define void @legal_vzmovl_2f64_4f64(<2 x double>* %in, <4 x double>* %out) {
-; CHECK-LABEL: legal_vzmovl_2f64_4f64:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovupd (%rdi), %xmm0
-; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
-; CHECK-NEXT: vmovapd %ymm0, (%rsi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; X32-LABEL: legal_vzmovl_2f64_4f64:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovupd (%ecx), %xmm0
+; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; X32-NEXT: vmovapd %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: legal_vzmovl_2f64_4f64:
+; X64: # BB#0:
+; X64-NEXT: vmovupd (%rdi), %xmm0
+; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; X64-NEXT: vmovapd %ymm0, (%rsi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%ld = load <2 x double>, <2 x double>* %in, align 8
%ext = extractelement <2 x double> %ld, i64 0
%ins = insertelement <4 x double> <double undef, double 0.0, double 0.0, double 0.0>, double %ext, i64 0
diff --git a/test/CodeGen/X86/vec_extract-mmx.ll b/test/CodeGen/X86/vec_extract-mmx.ll
index 780066d2da15..329437cfedab 100644
--- a/test/CodeGen/X86/vec_extract-mmx.ll
+++ b/test/CodeGen/X86/vec_extract-mmx.ll
@@ -1,12 +1,35 @@
-; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64
-define i32 @test0(<1 x i64>* %v4) {
-; CHECK-LABEL: test0:
-; CHECK: # BB#0:{{.*}} %entry
-; CHECK: pshufw $238, (%[[REG:[a-z]+]]), %mm0
-; CHECK-NEXT: movd %mm0, %eax
-; CHECK-NEXT: addl $32, %eax
-; CHECK-NEXT: retq
+define i32 @test0(<1 x i64>* %v4) nounwind {
+; X32-LABEL: test0:
+; X32: # BB#0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-8, %esp
+; X32-NEXT: subl $24, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl (%eax), %ecx
+; X32-NEXT: movl 4(%eax), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: movl %ecx, (%esp)
+; X32-NEXT: pshufw $238, (%esp), %mm0 # mm0 = mem[2,3,2,3]
+; X32-NEXT: movq %mm0, {{[0-9]+}}(%esp)
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X32-NEXT: movd %xmm0, %eax
+; X32-NEXT: addl $32, %eax
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test0:
+; X64: # BB#0: # %entry
+; X64-NEXT: pshufw $238, (%rdi), %mm0 # mm0 = mem[2,3,2,3]
+; X64-NEXT: movd %mm0, %eax
+; X64-NEXT: addl $32, %eax
+; X64-NEXT: retq
entry:
%v5 = load <1 x i64>, <1 x i64>* %v4, align 8
%v12 = bitcast <1 x i64> %v5 to <4 x i16>
@@ -21,14 +44,32 @@ entry:
ret i32 %v20
}
-define i32 @test1(i32* nocapture readonly %ptr) {
-; CHECK-LABEL: test1:
-; CHECK: # BB#0:{{.*}} %entry
-; CHECK: movd (%[[REG]]), %mm0
-; CHECK-NEXT: pshufw $232, %mm0, %mm0
-; CHECK-NEXT: movd %mm0, %eax
-; CHECK-NEXT: emms
-; CHECK-NEXT: retq
+define i32 @test1(i32* nocapture readonly %ptr) nounwind {
+; X32-LABEL: test1:
+; X32: # BB#0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-8, %esp
+; X32-NEXT: subl $16, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movd (%eax), %mm0
+; X32-NEXT: pshufw $232, %mm0, %mm0 # mm0 = mm0[0,2,2,3]
+; X32-NEXT: movq %mm0, (%esp)
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X32-NEXT: movd %xmm0, %eax
+; X32-NEXT: emms
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test1:
+; X64: # BB#0: # %entry
+; X64-NEXT: movd (%rdi), %mm0
+; X64-NEXT: pshufw $232, %mm0, %mm0 # mm0 = mm0[0,2,2,3]
+; X64-NEXT: movd %mm0, %eax
+; X64-NEXT: emms
+; X64-NEXT: retq
entry:
%0 = load i32, i32* %ptr, align 4
%1 = insertelement <2 x i32> undef, i32 %0, i32 0
@@ -47,13 +88,30 @@ entry:
ret i32 %12
}
-define i32 @test2(i32* nocapture readonly %ptr) {
-; CHECK-LABEL: test2:
-; CHECK: # BB#0:{{.*}} %entry
-; CHECK: pshufw $232, (%[[REG]]), %mm0
-; CHECK-NEXT: movd %mm0, %eax
-; CHECK-NEXT: emms
-; CHECK-NEXT: retq
+define i32 @test2(i32* nocapture readonly %ptr) nounwind {
+; X32-LABEL: test2:
+; X32: # BB#0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-8, %esp
+; X32-NEXT: subl $16, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: pshufw $232, (%eax), %mm0 # mm0 = mem[0,2,2,3]
+; X32-NEXT: movq %mm0, (%esp)
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X32-NEXT: movd %xmm0, %eax
+; X32-NEXT: emms
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test2:
+; X64: # BB#0: # %entry
+; X64-NEXT: pshufw $232, (%rdi), %mm0 # mm0 = mem[0,2,2,3]
+; X64-NEXT: movd %mm0, %eax
+; X64-NEXT: emms
+; X64-NEXT: retq
entry:
%0 = bitcast i32* %ptr to x86_mmx*
%1 = load x86_mmx, x86_mmx* %0, align 8
@@ -67,5 +125,48 @@ entry:
ret i32 %7
}
+define i32 @test3(x86_mmx %a) nounwind {
+; X32-LABEL: test3:
+; X32: # BB#0:
+; X32-NEXT: movd %mm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test3:
+; X64: # BB#0:
+; X64-NEXT: movd %mm0, %eax
+; X64-NEXT: retq
+ %tmp0 = bitcast x86_mmx %a to <2 x i32>
+ %tmp1 = extractelement <2 x i32> %tmp0, i32 0
+ ret i32 %tmp1
+}
+
+; Verify we don't muck with extractelts from the upper lane.
+define i32 @test4(x86_mmx %a) nounwind {
+; X32-LABEL: test4:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-8, %esp
+; X32-NEXT: subl $8, %esp
+; X32-NEXT: movq %mm0, (%esp)
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,0,1]
+; X32-NEXT: movd %xmm0, %eax
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: test4:
+; X64: # BB#0:
+; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,0,1]
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: retq
+ %tmp0 = bitcast x86_mmx %a to <2 x i32>
+ %tmp1 = extractelement <2 x i32> %tmp0, i32 1
+ ret i32 %tmp1
+}
+
declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)
declare void @llvm.x86.mmx.emms()
diff --git a/test/CodeGen/X86/vec_extract-sse4.ll b/test/CodeGen/X86/vec_extract-sse4.ll
index 9f4210f7847e..f073f1538d2e 100644
--- a/test/CodeGen/X86/vec_extract-sse4.ll
+++ b/test/CodeGen/X86/vec_extract-sse4.ll
@@ -1,60 +1,79 @@
-; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse4.1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64
define void @t1(float* %R, <4 x float>* %P1) nounwind {
-; CHECK-LABEL: t1:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movss 12(%ecx), %xmm0
-; CHECK-NEXT: movss %xmm0, (%eax)
-; CHECK-NEXT: retl
-
- %X = load <4 x float>, <4 x float>* %P1
- %tmp = extractelement <4 x float> %X, i32 3
- store float %tmp, float* %R
- ret void
+; X32-LABEL: t1:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0:
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movss %xmm0, (%rdi)
+; X64-NEXT: retq
+ %X = load <4 x float>, <4 x float>* %P1
+ %tmp = extractelement <4 x float> %X, i32 3
+ store float %tmp, float* %R
+ ret void
}
define float @t2(<4 x float>* %P1) nounwind {
-; CHECK-LABEL: t2:
-; CHECK: # BB#0:
-; CHECK-NEXT: pushl %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movapd (%eax), %xmm0
-; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT: movss %xmm0, (%esp)
-; CHECK-NEXT: flds (%esp)
-; CHECK-NEXT: popl %eax
-; CHECK-NEXT: retl
-
- %X = load <4 x float>, <4 x float>* %P1
- %tmp = extractelement <4 x float> %X, i32 2
- ret float %tmp
+; X32-LABEL: t2:
+; X32: # BB#0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
+; X32-NEXT: movss %xmm0, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: t2:
+; X64: # BB#0:
+; X64-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
+; X64-NEXT: retq
+ %X = load <4 x float>, <4 x float>* %P1
+ %tmp = extractelement <4 x float> %X, i32 2
+ ret float %tmp
}
define void @t3(i32* %R, <4 x i32>* %P1) nounwind {
-; CHECK-LABEL: t3:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl 12(%ecx), %ecx
-; CHECK-NEXT: movl %ecx, (%eax)
-; CHECK-NEXT: retl
-
- %X = load <4 x i32>, <4 x i32>* %P1
- %tmp = extractelement <4 x i32> %X, i32 3
- store i32 %tmp, i32* %R
- ret void
+; X32-LABEL: t3:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl 12(%ecx), %ecx
+; X32-NEXT: movl %ecx, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: t3:
+; X64: # BB#0:
+; X64-NEXT: movl 12(%rsi), %eax
+; X64-NEXT: movl %eax, (%rdi)
+; X64-NEXT: retq
+ %X = load <4 x i32>, <4 x i32>* %P1
+ %tmp = extractelement <4 x i32> %X, i32 3
+ store i32 %tmp, i32* %R
+ ret void
}
define i32 @t4(<4 x i32>* %P1) nounwind {
-; CHECK-LABEL: t4:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl 12(%eax), %eax
-; CHECK-NEXT: retl
-
- %X = load <4 x i32>, <4 x i32>* %P1
- %tmp = extractelement <4 x i32> %X, i32 3
- ret i32 %tmp
+; X32-LABEL: t4:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl 12(%eax), %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: t4:
+; X64: # BB#0:
+; X64-NEXT: movl 12(%rdi), %eax
+; X64-NEXT: retq
+ %X = load <4 x i32>, <4 x i32>* %P1
+ %tmp = extractelement <4 x i32> %X, i32 3
+ ret i32 %tmp
}
diff --git a/test/CodeGen/X86/vec_extract.ll b/test/CodeGen/X86/vec_extract.ll
index 3b478880590d..47f719d9e32e 100644
--- a/test/CodeGen/X86/vec_extract.ll
+++ b/test/CodeGen/X86/vec_extract.ll
@@ -1,74 +1,104 @@
-; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2,-sse4.1 | FileCheck %s
-
-target triple = "x86_64-unknown-linux-gnu"
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X64
define void @test1(<4 x float>* %F, float* %f) nounwind {
-; CHECK-LABEL: test1:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movaps (%ecx), %xmm0
-; CHECK-NEXT: addps %xmm0, %xmm0
-; CHECK-NEXT: movss %xmm0, (%eax)
-; CHECK-NEXT: retl
+; X32-LABEL: test1:
+; X32: # BB#0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movaps (%ecx), %xmm0
+; X32-NEXT: addps %xmm0, %xmm0
+; X32-NEXT: movss %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test1:
+; X64: # BB#0: # %entry
+; X64-NEXT: movaps (%rdi), %xmm0
+; X64-NEXT: addps %xmm0, %xmm0
+; X64-NEXT: movss %xmm0, (%rsi)
+; X64-NEXT: retq
entry:
- %tmp = load <4 x float>, <4 x float>* %F ; <<4 x float>> [#uses=2]
- %tmp7 = fadd <4 x float> %tmp, %tmp ; <<4 x float>> [#uses=1]
- %tmp2 = extractelement <4 x float> %tmp7, i32 0 ; <float> [#uses=1]
- store float %tmp2, float* %f
- ret void
+ %tmp = load <4 x float>, <4 x float>* %F
+ %tmp7 = fadd <4 x float> %tmp, %tmp
+ %tmp2 = extractelement <4 x float> %tmp7, i32 0
+ store float %tmp2, float* %f
+ ret void
}
define float @test2(<4 x float>* %F, float* %f) nounwind {
-; CHECK-LABEL: test2:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: pushl %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movaps (%eax), %xmm0
-; CHECK-NEXT: addps %xmm0, %xmm0
-; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT: movss %xmm0, (%esp)
-; CHECK-NEXT: flds (%esp)
-; CHECK-NEXT: popl %eax
-; CHECK-NEXT: retl
+; X32-LABEL: test2:
+; X32: # BB#0: # %entry
+; X32-NEXT: pushl %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps (%eax), %xmm0
+; X32-NEXT: addps %xmm0, %xmm0
+; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT: movss %xmm0, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test2:
+; X64: # BB#0: # %entry
+; X64-NEXT: movaps (%rdi), %xmm0
+; X64-NEXT: addps %xmm0, %xmm0
+; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT: retq
entry:
- %tmp = load <4 x float>, <4 x float>* %F ; <<4 x float>> [#uses=2]
- %tmp7 = fadd <4 x float> %tmp, %tmp ; <<4 x float>> [#uses=1]
- %tmp2 = extractelement <4 x float> %tmp7, i32 2 ; <float> [#uses=1]
- ret float %tmp2
+ %tmp = load <4 x float>, <4 x float>* %F
+ %tmp7 = fadd <4 x float> %tmp, %tmp
+ %tmp2 = extractelement <4 x float> %tmp7, i32 2
+ ret float %tmp2
}
define void @test3(float* %R, <4 x float>* %P1) nounwind {
-; CHECK-LABEL: test3:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movss 12(%ecx), %xmm0
-; CHECK-NEXT: movss %xmm0, (%eax)
-; CHECK-NEXT: retl
+; X32-LABEL: test3:
+; X32: # BB#0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test3:
+; X64: # BB#0: # %entry
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movss %xmm0, (%rdi)
+; X64-NEXT: retq
entry:
- %X = load <4 x float>, <4 x float>* %P1 ; <<4 x float>> [#uses=1]
- %tmp = extractelement <4 x float> %X, i32 3 ; <float> [#uses=1]
- store float %tmp, float* %R
- ret void
+ %X = load <4 x float>, <4 x float>* %P1
+ %tmp = extractelement <4 x float> %X, i32 3
+ store float %tmp, float* %R
+ ret void
}
define double @test4(double %A) nounwind {
-; CHECK-LABEL: test4:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: subl $12, %esp
-; CHECK-NEXT: calll foo
-; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT: addsd {{[0-9]+}}(%esp), %xmm0
-; CHECK-NEXT: movsd %xmm0, (%esp)
-; CHECK-NEXT: fldl (%esp)
-; CHECK-NEXT: addl $12, %esp
-; CHECK-NEXT: retl
+; X32-LABEL: test4:
+; X32: # BB#0: # %entry
+; X32-NEXT: subl $12, %esp
+; X32-NEXT: calll foo
+; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT: addsd {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT: movsd %xmm0, (%esp)
+; X32-NEXT: fldl (%esp)
+; X32-NEXT: addl $12, %esp
+; X32-NEXT: retl
+;
+; X64-LABEL: test4:
+; X64: # BB#0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
+; X64-NEXT: callq foo
+; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT: addsd (%rsp), %xmm0 # 8-byte Folded Reload
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
entry:
- %tmp1 = call <2 x double> @foo( ) ; <<2 x double>> [#uses=1]
- %tmp2 = extractelement <2 x double> %tmp1, i32 1 ; <double> [#uses=1]
- %tmp3 = fadd double %tmp2, %A ; <double> [#uses=1]
- ret double %tmp3
+ %tmp1 = call <2 x double> @foo( )
+ %tmp2 = extractelement <2 x double> %tmp1, i32 1
+ %tmp3 = fadd double %tmp2, %A
+ ret double %tmp3
}
declare <2 x double> @foo()
diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll
index 54f33b2bd224..0f5e09914890 100644
--- a/test/CodeGen/X86/vec_fabs.ll
+++ b/test/CodeGen/X86/vec_fabs.ll
@@ -1,37 +1,64 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X64
-define <2 x double> @fabs_v2f64(<2 x double> %p)
-{
- ; CHECK-LABEL: fabs_v2f64
- ; CHECK: vandpd
+define <2 x double> @fabs_v2f64(<2 x double> %p) {
+; X32-LABEL: fabs_v2f64:
+; X32: # BB#0:
+; X32-NEXT: vandpd .LCPI0_0, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: fabs_v2f64:
+; X64: # BB#0:
+; X64-NEXT: vandpd {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT: retq
%t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
ret <2 x double> %t
}
declare <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
-define <4 x float> @fabs_v4f32(<4 x float> %p)
-{
- ; CHECK-LABEL: fabs_v4f32
- ; CHECK: vandps
+define <4 x float> @fabs_v4f32(<4 x float> %p) {
+; X32-LABEL: fabs_v4f32:
+; X32: # BB#0:
+; X32-NEXT: vandps .LCPI1_0, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: fabs_v4f32:
+; X64: # BB#0:
+; X64-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT: retq
%t = call <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
ret <4 x float> %t
}
declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
-define <4 x double> @fabs_v4f64(<4 x double> %p)
-{
- ; CHECK-LABEL: fabs_v4f64
- ; CHECK: vandpd
+define <4 x double> @fabs_v4f64(<4 x double> %p) {
+; X32-LABEL: fabs_v4f64:
+; X32: # BB#0:
+; X32-NEXT: vandpd .LCPI2_0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: fabs_v4f64:
+; X64: # BB#0:
+; X64-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
%t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
ret <4 x double> %t
}
declare <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
-define <8 x float> @fabs_v8f32(<8 x float> %p)
-{
- ; CHECK-LABEL: fabs_v8f32
- ; CHECK: vandps
+define <8 x float> @fabs_v8f32(<8 x float> %p) {
+; X32-LABEL: fabs_v8f32:
+; X32: # BB#0:
+; X32-NEXT: vandps .LCPI3_0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: fabs_v8f32:
+; X64: # BB#0:
+; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
%t = call <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
ret <8 x float> %t
}
@@ -44,7 +71,7 @@ declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
; that has the sign bits turned off.
;
; So instead of something like this:
-; movabsq (constant pool load of mask for sign bits)
+; movabsq (constant pool load of mask for sign bits)
; vmovq (move from integer register to vector/fp register)
; vandps (mask off sign bits)
; vmovq (move vector/fp register back to integer return register)
@@ -53,9 +80,16 @@ declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
; mov (put constant value in return register)
define i64 @fabs_v2f32_1() {
-; CHECK-LABEL: fabs_v2f32_1:
-; CHECK: movabsq $9223372032559808512, %rax # imm = 0x7FFFFFFF00000000
-; CHECK-NEXT: retq
+; X32-LABEL: fabs_v2f32_1:
+; X32: # BB#0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: movl $2147483647, %edx # imm = 0x7FFFFFFF
+; X32-NEXT: retl
+;
+; X64-LABEL: fabs_v2f32_1:
+; X64: # BB#0:
+; X64-NEXT: movabsq $9223372032559808512, %rax # imm = 0x7FFFFFFF00000000
+; X64-NEXT: retq
%bitcast = bitcast i64 18446744069414584320 to <2 x float> ; 0xFFFF_FFFF_0000_0000
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
%ret = bitcast <2 x float> %fabs to i64
@@ -63,9 +97,16 @@ define i64 @fabs_v2f32_1() {
}
define i64 @fabs_v2f32_2() {
-; CHECK-LABEL: fabs_v2f32_2:
-; CHECK: movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-NEXT: retq
+; X32-LABEL: fabs_v2f32_2:
+; X32: # BB#0:
+; X32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+;
+; X64-LABEL: fabs_v2f32_2:
+; X64: # BB#0:
+; X64-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT: retq
%bitcast = bitcast i64 4294967295 to <2 x float> ; 0x0000_0000_FFFF_FFFF
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
%ret = bitcast <2 x float> %fabs to i64
diff --git a/test/CodeGen/X86/vec_floor.ll b/test/CodeGen/X86/vec_floor.ll
index f35c4ab4a76e..4fa79bc7fa8b 100644
--- a/test/CodeGen/X86/vec_floor.ll
+++ b/test/CodeGen/X86/vec_floor.ll
@@ -1,181 +1,312 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX
-
-define <2 x double> @floor_v2f64(<2 x double> %p)
-{
- ; CHECK: floor_v2f64
- ; CHECK: vroundpd
+define <2 x double> @floor_v2f64(<2 x double> %p) {
+; SSE41-LABEL: floor_v2f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $9, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: floor_v2f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $9, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p)
ret <2 x double> %t
}
declare <2 x double> @llvm.floor.v2f64(<2 x double> %p)
-define <4 x float> @floor_v4f32(<4 x float> %p)
-{
- ; CHECK: floor_v4f32
- ; CHECK: vroundps
+define <4 x float> @floor_v4f32(<4 x float> %p) {
+; SSE41-LABEL: floor_v4f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $9, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: floor_v4f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $9, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p)
ret <4 x float> %t
}
declare <4 x float> @llvm.floor.v4f32(<4 x float> %p)
-define <4 x double> @floor_v4f64(<4 x double> %p)
-{
- ; CHECK: floor_v4f64
- ; CHECK: vroundpd
+define <4 x double> @floor_v4f64(<4 x double> %p) {
+; SSE41-LABEL: floor_v4f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $9, %xmm0, %xmm0
+; SSE41-NEXT: roundpd $9, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: floor_v4f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $9, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p)
ret <4 x double> %t
}
declare <4 x double> @llvm.floor.v4f64(<4 x double> %p)
-define <8 x float> @floor_v8f32(<8 x float> %p)
-{
- ; CHECK: floor_v8f32
- ; CHECK: vroundps
+define <8 x float> @floor_v8f32(<8 x float> %p) {
+; SSE41-LABEL: floor_v8f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $9, %xmm0, %xmm0
+; SSE41-NEXT: roundps $9, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: floor_v8f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $9, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p)
ret <8 x float> %t
}
declare <8 x float> @llvm.floor.v8f32(<8 x float> %p)
-define <2 x double> @ceil_v2f64(<2 x double> %p)
-{
- ; CHECK: ceil_v2f64
- ; CHECK: vroundpd
+define <2 x double> @ceil_v2f64(<2 x double> %p) {
+; SSE41-LABEL: ceil_v2f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $10, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: ceil_v2f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $10, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
ret <2 x double> %t
}
declare <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
-define <4 x float> @ceil_v4f32(<4 x float> %p)
-{
- ; CHECK: ceil_v4f32
- ; CHECK: vroundps
+define <4 x float> @ceil_v4f32(<4 x float> %p) {
+; SSE41-LABEL: ceil_v4f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $10, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: ceil_v4f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $10, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
ret <4 x float> %t
}
declare <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
-define <4 x double> @ceil_v4f64(<4 x double> %p)
-{
- ; CHECK: ceil_v4f64
- ; CHECK: vroundpd
+define <4 x double> @ceil_v4f64(<4 x double> %p) {
+; SSE41-LABEL: ceil_v4f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $10, %xmm0, %xmm0
+; SSE41-NEXT: roundpd $10, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: ceil_v4f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $10, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <4 x double> @llvm.ceil.v4f64(<4 x double> %p)
ret <4 x double> %t
}
declare <4 x double> @llvm.ceil.v4f64(<4 x double> %p)
-define <8 x float> @ceil_v8f32(<8 x float> %p)
-{
- ; CHECK: ceil_v8f32
- ; CHECK: vroundps
+define <8 x float> @ceil_v8f32(<8 x float> %p) {
+; SSE41-LABEL: ceil_v8f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $10, %xmm0, %xmm0
+; SSE41-NEXT: roundps $10, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: ceil_v8f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $10, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <8 x float> @llvm.ceil.v8f32(<8 x float> %p)
ret <8 x float> %t
}
declare <8 x float> @llvm.ceil.v8f32(<8 x float> %p)
-define <2 x double> @trunc_v2f64(<2 x double> %p)
-{
- ; CHECK: trunc_v2f64
- ; CHECK: vroundpd
+define <2 x double> @trunc_v2f64(<2 x double> %p) {
+; SSE41-LABEL: trunc_v2f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $11, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc_v2f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $11, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <2 x double> @llvm.trunc.v2f64(<2 x double> %p)
ret <2 x double> %t
}
declare <2 x double> @llvm.trunc.v2f64(<2 x double> %p)
-define <4 x float> @trunc_v4f32(<4 x float> %p)
-{
- ; CHECK: trunc_v4f32
- ; CHECK: vroundps
+define <4 x float> @trunc_v4f32(<4 x float> %p) {
+; SSE41-LABEL: trunc_v4f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $11, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc_v4f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $11, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <4 x float> @llvm.trunc.v4f32(<4 x float> %p)
ret <4 x float> %t
}
declare <4 x float> @llvm.trunc.v4f32(<4 x float> %p)
-define <4 x double> @trunc_v4f64(<4 x double> %p)
-{
- ; CHECK: trunc_v4f64
- ; CHECK: vroundpd
+define <4 x double> @trunc_v4f64(<4 x double> %p) {
+; SSE41-LABEL: trunc_v4f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $11, %xmm0, %xmm0
+; SSE41-NEXT: roundpd $11, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc_v4f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <4 x double> @llvm.trunc.v4f64(<4 x double> %p)
ret <4 x double> %t
}
declare <4 x double> @llvm.trunc.v4f64(<4 x double> %p)
-define <8 x float> @trunc_v8f32(<8 x float> %p)
-{
- ; CHECK: trunc_v8f32
- ; CHECK: vroundps
+define <8 x float> @trunc_v8f32(<8 x float> %p) {
+; SSE41-LABEL: trunc_v8f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $11, %xmm0, %xmm0
+; SSE41-NEXT: roundps $11, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc_v8f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <8 x float> @llvm.trunc.v8f32(<8 x float> %p)
ret <8 x float> %t
}
declare <8 x float> @llvm.trunc.v8f32(<8 x float> %p)
-define <2 x double> @rint_v2f64(<2 x double> %p)
-{
- ; CHECK: rint_v2f64
- ; CHECK: vroundpd
+define <2 x double> @rint_v2f64(<2 x double> %p) {
+; SSE41-LABEL: rint_v2f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $4, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: rint_v2f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $4, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <2 x double> @llvm.rint.v2f64(<2 x double> %p)
ret <2 x double> %t
}
declare <2 x double> @llvm.rint.v2f64(<2 x double> %p)
-define <4 x float> @rint_v4f32(<4 x float> %p)
-{
- ; CHECK: rint_v4f32
- ; CHECK: vroundps
+define <4 x float> @rint_v4f32(<4 x float> %p) {
+; SSE41-LABEL: rint_v4f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $4, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: rint_v4f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $4, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <4 x float> @llvm.rint.v4f32(<4 x float> %p)
ret <4 x float> %t
}
declare <4 x float> @llvm.rint.v4f32(<4 x float> %p)
-define <4 x double> @rint_v4f64(<4 x double> %p)
-{
- ; CHECK: rint_v4f64
- ; CHECK: vroundpd
+define <4 x double> @rint_v4f64(<4 x double> %p) {
+; SSE41-LABEL: rint_v4f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $4, %xmm0, %xmm0
+; SSE41-NEXT: roundpd $4, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: rint_v4f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $4, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <4 x double> @llvm.rint.v4f64(<4 x double> %p)
ret <4 x double> %t
}
declare <4 x double> @llvm.rint.v4f64(<4 x double> %p)
-define <8 x float> @rint_v8f32(<8 x float> %p)
-{
- ; CHECK: rint_v8f32
- ; CHECK: vroundps
+define <8 x float> @rint_v8f32(<8 x float> %p) {
+; SSE41-LABEL: rint_v8f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $4, %xmm0, %xmm0
+; SSE41-NEXT: roundps $4, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: rint_v8f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $4, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <8 x float> @llvm.rint.v8f32(<8 x float> %p)
ret <8 x float> %t
}
declare <8 x float> @llvm.rint.v8f32(<8 x float> %p)
-define <2 x double> @nearbyint_v2f64(<2 x double> %p)
-{
- ; CHECK: nearbyint_v2f64
- ; CHECK: vroundpd
+define <2 x double> @nearbyint_v2f64(<2 x double> %p) {
+; SSE41-LABEL: nearbyint_v2f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $12, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: nearbyint_v2f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $12, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)
ret <2 x double> %t
}
declare <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)
-define <4 x float> @nearbyint_v4f32(<4 x float> %p)
-{
- ; CHECK: nearbyint_v4f32
- ; CHECK: vroundps
+define <4 x float> @nearbyint_v4f32(<4 x float> %p) {
+; SSE41-LABEL: nearbyint_v4f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $12, %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: nearbyint_v4f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $12, %xmm0, %xmm0
+; AVX-NEXT: retq
%t = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)
ret <4 x float> %t
}
declare <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)
-define <4 x double> @nearbyint_v4f64(<4 x double> %p)
-{
- ; CHECK: nearbyint_v4f64
- ; CHECK: vroundpd
+define <4 x double> @nearbyint_v4f64(<4 x double> %p) {
+; SSE41-LABEL: nearbyint_v4f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundpd $12, %xmm0, %xmm0
+; SSE41-NEXT: roundpd $12, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: nearbyint_v4f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundpd $12, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)
ret <4 x double> %t
}
declare <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)
-define <8 x float> @nearbyint_v8f32(<8 x float> %p)
-{
- ; CHECK: nearbyint_v8f32
- ; CHECK: vroundps
+define <8 x float> @nearbyint_v8f32(<8 x float> %p) {
+; SSE41-LABEL: nearbyint_v8f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: roundps $12, %xmm0, %xmm0
+; SSE41-NEXT: roundps $12, %xmm1, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: nearbyint_v8f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vroundps $12, %ymm0, %ymm0
+; AVX-NEXT: retq
%t = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
ret <8 x float> %t
}
@@ -186,43 +317,85 @@ declare <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
;
define <2 x double> @const_floor_v2f64() {
- ; CHECK: const_floor_v2f64
- ; CHECK: movaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
+; SSE41-LABEL: const_floor_v2f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: const_floor_v2f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
+; AVX-NEXT: retq
%t = call <2 x double> @llvm.floor.v2f64(<2 x double> <double -1.5, double 2.5>)
ret <2 x double> %t
}
define <4 x float> @const_floor_v4f32() {
- ; CHECK: const_floor_v4f32
- ; CHECK: movaps {{.*#+}} xmm0 = [-4.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
+; SSE41-LABEL: const_floor_v4f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-4.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: const_floor_v4f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-4.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
+; AVX-NEXT: retq
%t = call <4 x float> @llvm.floor.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
ret <4 x float> %t
}
define <2 x double> @const_ceil_v2f64() {
- ; CHECK: const_ceil_v2f64
- ; CHECK: movaps {{.*#+}} xmm0 = [-1.000000e+00,3.000000e+00]
+; SSE41-LABEL: const_ceil_v2f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-1.000000e+00,3.000000e+00]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: const_ceil_v2f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-1.000000e+00,3.000000e+00]
+; AVX-NEXT: retq
%t = call <2 x double> @llvm.ceil.v2f64(<2 x double> <double -1.5, double 2.5>)
ret <2 x double> %t
}
define <4 x float> @const_ceil_v4f32() {
- ; CHECK: const_ceil_v4f32
- ; CHECK: movaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,3.000000e+00]
+; SSE41-LABEL: const_ceil_v4f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,3.000000e+00]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: const_ceil_v4f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,3.000000e+00]
+; AVX-NEXT: retq
%t = call <4 x float> @llvm.ceil.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
ret <4 x float> %t
}
define <2 x double> @const_trunc_v2f64() {
- ; CHECK: const_trunc_v2f64
- ; CHECK: movaps {{.*#+}} xmm0 = [-1.000000e+00,2.000000e+00]
+; SSE41-LABEL: const_trunc_v2f64:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-1.000000e+00,2.000000e+00]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: const_trunc_v2f64:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-1.000000e+00,2.000000e+00]
+; AVX-NEXT: retq
%t = call <2 x double> @llvm.trunc.v2f64(<2 x double> <double -1.5, double 2.5>)
ret <2 x double> %t
}
define <4 x float> @const_trunc_v4f32() {
- ; CHECK: const_trunc_v4f32
- ; CHECK: movaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
+; SSE41-LABEL: const_trunc_v4f32:
+; SSE41: ## BB#0:
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: const_trunc_v4f32:
+; AVX: ## BB#0:
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
+; AVX-NEXT: retq
%t = call <4 x float> @llvm.trunc.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
ret <4 x float> %t
}
diff --git a/test/CodeGen/X86/vec_fneg.ll b/test/CodeGen/X86/vec_fneg.ll
index a85ae984d8e6..78799ff04fe1 100644
--- a/test/CodeGen/X86/vec_fneg.ll
+++ b/test/CodeGen/X86/vec_fneg.ll
@@ -1,25 +1,43 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=X32-SSE --check-prefix=X32-SSE1
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32-SSE --check-prefix=X32-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2 | FileCheck %s --check-prefix=X64-SSE --check-prefix=X64-SSE1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64-SSE --check-prefix=X64-SSE2
; FNEG is defined as subtraction from -0.0.
; This test verifies that we use an xor with a constant to flip the sign bits; no subtraction needed.
-define <4 x float> @t1(<4 x float> %Q) {
-; CHECK-LABEL: t1:
-; CHECK: xorps {{.*}}LCPI0_0{{.*}}, %xmm0
-; CHECK-NEXT: retq
- %tmp = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %Q
- ret <4 x float> %tmp
+define <4 x float> @t1(<4 x float> %Q) nounwind {
+; X32-SSE-LABEL: t1:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: xorps .LCPI0_0, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: t1:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: xorps {{.*}}(%rip), %xmm0
+; X64-SSE-NEXT: retq
+ %tmp = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %Q
+ ret <4 x float> %tmp
}
; This test verifies that we generate an FP subtraction because "0.0 - x" is not an fneg.
-define <4 x float> @t2(<4 x float> %Q) {
-; CHECK-LABEL: t2:
-; CHECK: xorps %[[X:xmm[0-9]+]], %[[X]]
-; CHECK-NEXT: subps %xmm0, %[[X]]
-; CHECK-NEXT: movaps %[[X]], %xmm0
-; CHECK-NEXT: retq
- %tmp = fsub <4 x float> zeroinitializer, %Q
- ret <4 x float> %tmp
+define <4 x float> @t2(<4 x float> %Q) nounwind {
+; X32-SSE-LABEL: t2:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: xorps %xmm1, %xmm1
+; X32-SSE-NEXT: subps %xmm0, %xmm1
+; X32-SSE-NEXT: movaps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: t2:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: xorps %xmm1, %xmm1
+; X64-SSE-NEXT: subps %xmm0, %xmm1
+; X64-SSE-NEXT: movaps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %tmp = fsub <4 x float> zeroinitializer, %Q
+ ret <4 x float> %tmp
}
; If we're bitcasting an integer to an FP vector, we should avoid the FPU/vector unit entirely.
@@ -31,14 +49,51 @@ define <4 x float> @t2(<4 x float> %Q) {
; We should generate:
; movabsq (put sign bit mask in integer register))
; xorq (flip sign bits)
-; movd (move to xmm return register)
+; movd (move to xmm return register)
-define <2 x float> @fneg_bitcast(i64 %i) {
-; CHECK-LABEL: fneg_bitcast:
-; CHECK: movabsq $-9223372034707292160, %rax # imm = 0x8000000080000000
-; CHECK-NEXT: xorq %rdi, %rax
-; CHECK-NEXT: movd %rax, %xmm0
-; CHECK-NEXT: retq
+define <2 x float> @fneg_bitcast(i64 %i) nounwind {
+; X32-SSE1-LABEL: fneg_bitcast:
+; X32-SSE1: # BB#0:
+; X32-SSE1-NEXT: pushl %ebp
+; X32-SSE1-NEXT: movl %esp, %ebp
+; X32-SSE1-NEXT: andl $-16, %esp
+; X32-SSE1-NEXT: subl $32, %esp
+; X32-SSE1-NEXT: movl $-2147483648, %eax # imm = 0x80000000
+; X32-SSE1-NEXT: movl 12(%ebp), %ecx
+; X32-SSE1-NEXT: xorl %eax, %ecx
+; X32-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X32-SSE1-NEXT: xorl 8(%ebp), %eax
+; X32-SSE1-NEXT: movl %eax, (%esp)
+; X32-SSE1-NEXT: movaps (%esp), %xmm0
+; X32-SSE1-NEXT: movl %ebp, %esp
+; X32-SSE1-NEXT: popl %ebp
+; X32-SSE1-NEXT: retl
+;
+; X32-SSE2-LABEL: fneg_bitcast:
+; X32-SSE2: # BB#0:
+; X32-SSE2-NEXT: movl $-2147483648, %eax # imm = 0x80000000
+; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE2-NEXT: xorl %eax, %ecx
+; X32-SSE2-NEXT: movd %ecx, %xmm1
+; X32-SSE2-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; X32-SSE2-NEXT: movd %eax, %xmm0
+; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE2-NEXT: retl
+;
+; X64-SSE1-LABEL: fneg_bitcast:
+; X64-SSE1: # BB#0:
+; X64-SSE1-NEXT: movabsq $-9223372034707292160, %rax # imm = 0x8000000080000000
+; X64-SSE1-NEXT: xorq %rdi, %rax
+; X64-SSE1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE1-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-SSE1-NEXT: retq
+;
+; X64-SSE2-LABEL: fneg_bitcast:
+; X64-SSE2: # BB#0:
+; X64-SSE2-NEXT: movabsq $-9223372034707292160, %rax # imm = 0x8000000080000000
+; X64-SSE2-NEXT: xorq %rdi, %rax
+; X64-SSE2-NEXT: movd %rax, %xmm0
+; X64-SSE2-NEXT: retq
%bitcast = bitcast i64 %i to <2 x float>
%fneg = fsub <2 x float> <float -0.0, float -0.0>, %bitcast
ret <2 x float> %fneg
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index 7834b2804247..0ad5ef7ee8f5 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -81,6 +81,7 @@ define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) {
;
; AVX-LABEL: fptosi_4f64_to_2i32:
; AVX: # BB#0:
+; AVX-NEXT: # kill
; AVX-NEXT: vcvttpd2dqy %ymm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll
index bb5409b91ee4..5f14324958a2 100644
--- a/test/CodeGen/X86/vec_fpext.ll
+++ b/test/CodeGen/X86/vec_fpext.ll
@@ -1,13 +1,39 @@
-; RUN: llc < %s -march=x86 -mattr=+sse4.1,-avx | FileCheck %s
-; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck --check-prefix=AVX %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=X32-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64-AVX
; PR11674
define void @fpext_frommem(<2 x float>* %in, <2 x double>* %out) {
-; CHECK-LABEL: fpext_frommem:
-; AVX-LABEL: fpext_frommem:
+; X32-SSE-LABEL: fpext_frommem:
+; X32-SSE: # BB#0: # %entry
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE-NEXT: cvtps2pd (%ecx), %xmm0
+; X32-SSE-NEXT: movups %xmm0, (%eax)
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: fpext_frommem:
+; X32-AVX: # BB#0: # %entry
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX-NEXT: vcvtps2pd (%ecx), %xmm0
+; X32-AVX-NEXT: vmovups %xmm0, (%eax)
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fpext_frommem:
+; X64-SSE: # BB#0: # %entry
+; X64-SSE-NEXT: cvtps2pd (%rdi), %xmm0
+; X64-SSE-NEXT: movups %xmm0, (%rsi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fpext_frommem:
+; X64-AVX: # BB#0: # %entry
+; X64-AVX-NEXT: vcvtps2pd (%rdi), %xmm0
+; X64-AVX-NEXT: vmovups %xmm0, (%rsi)
+; X64-AVX-NEXT: retq
entry:
-; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
-; AVX: vcvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
%0 = load <2 x float>, <2 x float>* %in, align 8
%1 = fpext <2 x float> %0 to <2 x double>
store <2 x double> %1, <2 x double>* %out, align 1
@@ -15,12 +41,40 @@ entry:
}
define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) {
-; CHECK-LABEL: fpext_frommem4:
-; AVX-LABEL: fpext_frommem4:
+; X32-SSE-LABEL: fpext_frommem4:
+; X32-SSE: # BB#0: # %entry
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE-NEXT: cvtps2pd (%ecx), %xmm0
+; X32-SSE-NEXT: cvtps2pd 8(%ecx), %xmm1
+; X32-SSE-NEXT: movups %xmm1, 16(%eax)
+; X32-SSE-NEXT: movups %xmm0, (%eax)
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: fpext_frommem4:
+; X32-AVX: # BB#0: # %entry
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX-NEXT: vcvtps2pd (%ecx), %ymm0
+; X32-AVX-NEXT: vmovups %ymm0, (%eax)
+; X32-AVX-NEXT: vzeroupper
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fpext_frommem4:
+; X64-SSE: # BB#0: # %entry
+; X64-SSE-NEXT: cvtps2pd (%rdi), %xmm0
+; X64-SSE-NEXT: cvtps2pd 8(%rdi), %xmm1
+; X64-SSE-NEXT: movups %xmm1, 16(%rsi)
+; X64-SSE-NEXT: movups %xmm0, (%rsi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fpext_frommem4:
+; X64-AVX: # BB#0: # %entry
+; X64-AVX-NEXT: vcvtps2pd (%rdi), %ymm0
+; X64-AVX-NEXT: vmovups %ymm0, (%rsi)
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
entry:
-; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
-; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
-; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
%0 = load <4 x float>, <4 x float>* %in
%1 = fpext <4 x float> %0 to <4 x double>
store <4 x double> %1, <4 x double>* %out, align 1
@@ -28,15 +82,52 @@ entry:
}
define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) {
-; CHECK-LABEL: fpext_frommem8:
-; AVX-LABEL: fpext_frommem8:
+; X32-SSE-LABEL: fpext_frommem8:
+; X32-SSE: # BB#0: # %entry
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE-NEXT: cvtps2pd (%ecx), %xmm0
+; X32-SSE-NEXT: cvtps2pd 8(%ecx), %xmm1
+; X32-SSE-NEXT: cvtps2pd 16(%ecx), %xmm2
+; X32-SSE-NEXT: cvtps2pd 24(%ecx), %xmm3
+; X32-SSE-NEXT: movups %xmm3, 48(%eax)
+; X32-SSE-NEXT: movups %xmm2, 32(%eax)
+; X32-SSE-NEXT: movups %xmm1, 16(%eax)
+; X32-SSE-NEXT: movups %xmm0, (%eax)
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: fpext_frommem8:
+; X32-AVX: # BB#0: # %entry
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX-NEXT: vcvtps2pd (%ecx), %ymm0
+; X32-AVX-NEXT: vcvtps2pd 16(%ecx), %ymm1
+; X32-AVX-NEXT: vmovups %ymm1, 32(%eax)
+; X32-AVX-NEXT: vmovups %ymm0, (%eax)
+; X32-AVX-NEXT: vzeroupper
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fpext_frommem8:
+; X64-SSE: # BB#0: # %entry
+; X64-SSE-NEXT: cvtps2pd (%rdi), %xmm0
+; X64-SSE-NEXT: cvtps2pd 8(%rdi), %xmm1
+; X64-SSE-NEXT: cvtps2pd 16(%rdi), %xmm2
+; X64-SSE-NEXT: cvtps2pd 24(%rdi), %xmm3
+; X64-SSE-NEXT: movups %xmm3, 48(%rsi)
+; X64-SSE-NEXT: movups %xmm2, 32(%rsi)
+; X64-SSE-NEXT: movups %xmm1, 16(%rsi)
+; X64-SSE-NEXT: movups %xmm0, (%rsi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fpext_frommem8:
+; X64-AVX: # BB#0: # %entry
+; X64-AVX-NEXT: vcvtps2pd (%rdi), %ymm0
+; X64-AVX-NEXT: vcvtps2pd 16(%rdi), %ymm1
+; X64-AVX-NEXT: vmovups %ymm1, 32(%rsi)
+; X64-AVX-NEXT: vmovups %ymm0, (%rsi)
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
entry:
-; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
-; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
-; CHECK: cvtps2pd 16(%{{.+}}), %xmm{{[0-9]+}}
-; CHECK: cvtps2pd 24(%{{.+}}), %xmm{{[0-9]+}}
-; AVX: vcvtps2pd 16(%{{.+}}), %ymm{{[0-9]+}}
-; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
%0 = load <8 x float>, <8 x float>* %in
%1 = fpext <8 x float> %0 to <8 x double>
store <8 x double> %1, <8 x double>* %out, align 1
@@ -44,11 +135,26 @@ entry:
}
define <2 x double> @fpext_fromconst() {
-; CHECK-LABEL: fpext_fromconst:
-; AVX-LABEL: fpext_fromconst:
+; X32-SSE-LABEL: fpext_fromconst:
+; X32-SSE: # BB#0: # %entry
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: fpext_fromconst:
+; X32-AVX: # BB#0: # %entry
+; X32-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fpext_fromconst:
+; X64-SSE: # BB#0: # %entry
+; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fpext_fromconst:
+; X64-AVX: # BB#0: # %entry
+; X64-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]
+; X64-AVX-NEXT: retq
entry:
-; CHECK: movaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]
-; AVX: vmovaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]
%0 = insertelement <2 x float> undef, float 1.0, i32 0
%1 = insertelement <2 x float> %0, float -2.0, i32 1
%2 = fpext <2 x float> %1 to <2 x double>
diff --git a/test/CodeGen/X86/vec_fptrunc.ll b/test/CodeGen/X86/vec_fptrunc.ll
new file mode 100644
index 000000000000..fa22a4af1755
--- /dev/null
+++ b/test/CodeGen/X86/vec_fptrunc.ll
@@ -0,0 +1,168 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=X32-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64-AVX
+
+define void @fptrunc_frommem2(<2 x double>* %in, <2 x float>* %out) {
+; X32-SSE-LABEL: fptrunc_frommem2:
+; X32-SSE: # BB#0: # %entry
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE-NEXT: cvtpd2ps (%ecx), %xmm0
+; X32-SSE-NEXT: extractps $1, %xmm0, 4(%eax)
+; X32-SSE-NEXT: movss %xmm0, (%eax)
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: fptrunc_frommem2:
+; X32-AVX: # BB#0: # %entry
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX-NEXT: vcvtpd2psx (%ecx), %xmm0
+; X32-AVX-NEXT: vextractps $1, %xmm0, 4(%eax)
+; X32-AVX-NEXT: vmovss %xmm0, (%eax)
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fptrunc_frommem2:
+; X64-SSE: # BB#0: # %entry
+; X64-SSE-NEXT: cvtpd2ps (%rdi), %xmm0
+; X64-SSE-NEXT: movlpd %xmm0, (%rsi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fptrunc_frommem2:
+; X64-AVX: # BB#0: # %entry
+; X64-AVX-NEXT: vcvtpd2psx (%rdi), %xmm0
+; X64-AVX-NEXT: vmovlpd %xmm0, (%rsi)
+; X64-AVX-NEXT: retq
+entry:
+ %0 = load <2 x double>, <2 x double>* %in
+ %1 = fptrunc <2 x double> %0 to <2 x float>
+ store <2 x float> %1, <2 x float>* %out, align 1
+ ret void
+}
+
+define void @fptrunc_frommem4(<4 x double>* %in, <4 x float>* %out) {
+; X32-SSE-LABEL: fptrunc_frommem4:
+; X32-SSE: # BB#0: # %entry
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE-NEXT: cvtpd2ps 16(%ecx), %xmm0
+; X32-SSE-NEXT: cvtpd2ps (%ecx), %xmm1
+; X32-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X32-SSE-NEXT: movupd %xmm1, (%eax)
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: fptrunc_frommem4:
+; X32-AVX: # BB#0: # %entry
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX-NEXT: vcvtpd2psy (%ecx), %xmm0
+; X32-AVX-NEXT: vmovupd %xmm0, (%eax)
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fptrunc_frommem4:
+; X64-SSE: # BB#0: # %entry
+; X64-SSE-NEXT: cvtpd2ps 16(%rdi), %xmm0
+; X64-SSE-NEXT: cvtpd2ps (%rdi), %xmm1
+; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-SSE-NEXT: movupd %xmm1, (%rsi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fptrunc_frommem4:
+; X64-AVX: # BB#0: # %entry
+; X64-AVX-NEXT: vcvtpd2psy (%rdi), %xmm0
+; X64-AVX-NEXT: vmovupd %xmm0, (%rsi)
+; X64-AVX-NEXT: retq
+entry:
+ %0 = load <4 x double>, <4 x double>* %in
+ %1 = fptrunc <4 x double> %0 to <4 x float>
+ store <4 x float> %1, <4 x float>* %out, align 1
+ ret void
+}
+
+define void @fptrunc_frommem8(<8 x double>* %in, <8 x float>* %out) {
+; X32-SSE-LABEL: fptrunc_frommem8:
+; X32-SSE: # BB#0: # %entry
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE-NEXT: cvtpd2ps 16(%ecx), %xmm0
+; X32-SSE-NEXT: cvtpd2ps (%ecx), %xmm1
+; X32-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X32-SSE-NEXT: cvtpd2ps 48(%ecx), %xmm0
+; X32-SSE-NEXT: cvtpd2ps 32(%ecx), %xmm2
+; X32-SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; X32-SSE-NEXT: movupd %xmm2, 16(%eax)
+; X32-SSE-NEXT: movupd %xmm1, (%eax)
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: fptrunc_frommem8:
+; X32-AVX: # BB#0: # %entry
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX-NEXT: vcvtpd2psy (%ecx), %xmm0
+; X32-AVX-NEXT: vcvtpd2psy 32(%ecx), %xmm1
+; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovupd %ymm0, (%eax)
+; X32-AVX-NEXT: vzeroupper
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fptrunc_frommem8:
+; X64-SSE: # BB#0: # %entry
+; X64-SSE-NEXT: cvtpd2ps 16(%rdi), %xmm0
+; X64-SSE-NEXT: cvtpd2ps (%rdi), %xmm1
+; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-SSE-NEXT: cvtpd2ps 48(%rdi), %xmm0
+; X64-SSE-NEXT: cvtpd2ps 32(%rdi), %xmm2
+; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; X64-SSE-NEXT: movupd %xmm2, 16(%rsi)
+; X64-SSE-NEXT: movupd %xmm1, (%rsi)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fptrunc_frommem8:
+; X64-AVX: # BB#0: # %entry
+; X64-AVX-NEXT: vcvtpd2psy (%rdi), %xmm0
+; X64-AVX-NEXT: vcvtpd2psy 32(%rdi), %xmm1
+; X64-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX-NEXT: vmovupd %ymm0, (%rsi)
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
+entry:
+ %0 = load <8 x double>, <8 x double>* %in
+ %1 = fptrunc <8 x double> %0 to <8 x float>
+ store <8 x float> %1, <8 x float>* %out, align 1
+ ret void
+}
+
+; FIXME: For exact truncations we should be able to fold this.
+define <4 x float> @fptrunc_fromconst() {
+; X32-SSE-LABEL: fptrunc_fromconst:
+; X32-SSE: # BB#0: # %entry
+; X32-SSE-NEXT: cvtpd2ps .LCPI3_0, %xmm1
+; X32-SSE-NEXT: cvtpd2ps .LCPI3_1, %xmm0
+; X32-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT: retl
+;
+; X32-AVX-LABEL: fptrunc_fromconst:
+; X32-AVX: # BB#0: # %entry
+; X32-AVX-NEXT: vcvtpd2psy .LCPI3_0, %xmm0
+; X32-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: fptrunc_fromconst:
+; X64-SSE: # BB#0: # %entry
+; X64-SSE-NEXT: cvtpd2ps {{.*}}(%rip), %xmm1
+; X64-SSE-NEXT: cvtpd2ps {{.*}}(%rip), %xmm0
+; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: fptrunc_fromconst:
+; X64-AVX: # BB#0: # %entry
+; X64-AVX-NEXT: vcvtpd2psy {{.*}}(%rip), %xmm0
+; X64-AVX-NEXT: retq
+entry:
+ %0 = insertelement <4 x double> undef, double 1.0, i32 0
+ %1 = insertelement <4 x double> %0, double -2.0, i32 1
+ %2 = insertelement <4 x double> %1, double +4.0, i32 2
+ %3 = insertelement <4 x double> %2, double -0.0, i32 3
+ %4 = fptrunc <4 x double> %3 to <4 x float>
+ ret <4 x float> %4
+}
diff --git a/test/CodeGen/X86/vec_i64.ll b/test/CodeGen/X86/vec_i64.ll
index 48ca1ff021d9..e468839ddc23 100644
--- a/test/CodeGen/X86/vec_i64.ll
+++ b/test/CodeGen/X86/vec_i64.ll
@@ -1,22 +1,43 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -o %t
-; RUN: grep movq %t | count 2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
; Used movq to load i64 into a v2i64 when the top i64 is 0.
define <2 x i64> @foo1(i64* %y) nounwind {
+; X32-LABEL: foo1:
+; X32: # BB#0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: foo1:
+; X64: # BB#0: # %entry
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: retq
entry:
- %tmp1 = load i64, i64* %y, align 8 ; <i64> [#uses=1]
- %s2v = insertelement <2 x i64> undef, i64 %tmp1, i32 0
- %loadl = shufflevector <2 x i64> zeroinitializer, <2 x i64> %s2v, <2 x i32> <i32 2, i32 1>
- ret <2 x i64> %loadl
+ %tmp1 = load i64, i64* %y, align 8
+ %s2v = insertelement <2 x i64> undef, i64 %tmp1, i32 0
+ %loadl = shufflevector <2 x i64> zeroinitializer, <2 x i64> %s2v, <2 x i32> <i32 2, i32 1>
+ ret <2 x i64> %loadl
}
define <4 x float> @foo2(i64* %p) nounwind {
+; X32-LABEL: foo2:
+; X32: # BB#0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: foo2:
+; X64: # BB#0: # %entry
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: retq
entry:
- %load = load i64, i64* %p
- %s2v = insertelement <2 x i64> undef, i64 %load, i32 0
- %loadl = shufflevector <2 x i64> zeroinitializer, <2 x i64> %s2v, <2 x i32> <i32 2, i32 1>
- %0 = bitcast <2 x i64> %loadl to <4 x float>
- ret <4 x float> %0
+ %load = load i64, i64* %p
+ %s2v = insertelement <2 x i64> undef, i64 %load, i32 0
+ %loadl = shufflevector <2 x i64> zeroinitializer, <2 x i64> %s2v, <2 x i32> <i32 2, i32 1>
+ %0 = bitcast <2 x i64> %loadl to <4 x float>
+ ret <4 x float> %0
}
diff --git a/test/CodeGen/X86/vec_ins_extract-1.ll b/test/CodeGen/X86/vec_ins_extract-1.ll
index 565be7a6cc70..8019e11ad4c0 100644
--- a/test/CodeGen/X86/vec_ins_extract-1.ll
+++ b/test/CodeGen/X86/vec_ins_extract-1.ll
@@ -1,24 +1,109 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | grep "(%esp,%eax,4)" | count 4
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64
; Inserts and extracts with variable indices must be lowered
; to memory accesses.
define i32 @t0(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
+; X32-LABEL: t0:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: movaps %xmm0, (%esp)
+; X32-NEXT: movl $76, (%esp,%eax,4)
+; X32-NEXT: movl (%esp), %eax
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: t0:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movslq %edi, %rax
+; X64-NEXT: movl $76, -24(%rsp,%rax,4)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: retq
%t13 = insertelement <4 x i32> %t8, i32 76, i32 %t7
%t9 = extractelement <4 x i32> %t13, i32 0
ret i32 %t9
}
+
define i32 @t1(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
+; X32-LABEL: t1:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: movl $76, %ecx
+; X32-NEXT: pinsrd $0, %ecx, %xmm0
+; X32-NEXT: movdqa %xmm0, (%esp)
+; X32-NEXT: movl (%esp,%eax,4), %eax
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0:
+; X64-NEXT: movl $76, %eax
+; X64-NEXT: pinsrd $0, %eax, %xmm0
+; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movslq %edi, %rax
+; X64-NEXT: movl -24(%rsp,%rax,4), %eax
+; X64-NEXT: retq
%t13 = insertelement <4 x i32> %t8, i32 76, i32 0
%t9 = extractelement <4 x i32> %t13, i32 %t7
ret i32 %t9
}
+
define <4 x i32> @t2(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
+; X32-LABEL: t2:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: movdqa %xmm0, (%esp)
+; X32-NEXT: pinsrd $0, (%esp,%eax,4), %xmm0
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: t2:
+; X64: # BB#0:
+; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movslq %edi, %rax
+; X64-NEXT: pinsrd $0, -24(%rsp,%rax,4), %xmm0
+; X64-NEXT: retq
%t9 = extractelement <4 x i32> %t8, i32 %t7
%t13 = insertelement <4 x i32> %t8, i32 %t9, i32 0
ret <4 x i32> %t13
}
+
define <4 x i32> @t3(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
+; X32-LABEL: t3:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: movaps %xmm0, (%esp)
+; X32-NEXT: movd %xmm0, (%esp,%eax,4)
+; X32-NEXT: movaps (%esp), %xmm0
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: t3:
+; X64: # BB#0:
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movslq %edi, %rax
+; X64-NEXT: movd %xmm0, -24(%rsp,%rax,4)
+; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT: retq
%t9 = extractelement <4 x i32> %t8, i32 0
%t13 = insertelement <4 x i32> %t8, i32 %t9, i32 %t7
ret <4 x i32> %t13
diff --git a/test/CodeGen/X86/vec_ins_extract.ll b/test/CodeGen/X86/vec_ins_extract.ll
index e92f46dbabb5..5ff49eff6df3 100644
--- a/test/CodeGen/X86/vec_ins_extract.ll
+++ b/test/CodeGen/X86/vec_ins_extract.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -scalarrepl -instcombine | \
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: opt < %s -sroa -instcombine | \
; RUN: llc -march=x86 -mcpu=yonah | not grep sub.*esp
; This checks that various insert/extract idiom work without going to the
diff --git a/test/CodeGen/X86/vec_insert-2.ll b/test/CodeGen/X86/vec_insert-2.ll
index fe20a474f59a..2e6654185de8 100644
--- a/test/CodeGen/X86/vec_insert-2.ll
+++ b/test/CodeGen/X86/vec_insert-2.ll
@@ -1,42 +1,68 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | FileCheck --check-prefix=X32 %s
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,-sse4.1 | FileCheck --check-prefix=X64 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X64
define <4 x float> @t1(float %s, <4 x float> %tmp) nounwind {
; X32-LABEL: t1:
-; X32: shufps $36
-; X32: ret
-
+; X32: # BB#0:
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0:
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
%tmp1 = insertelement <4 x float> %tmp, float %s, i32 3
ret <4 x float> %tmp1
}
define <4 x i32> @t2(i32 %s, <4 x i32> %tmp) nounwind {
; X32-LABEL: t2:
-; X32: shufps $36
-; X32: ret
-
+; X32: # BB#0:
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: t2:
+; X64: # BB#0:
+; X64-NEXT: movd %edi, %xmm1
+; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X64-NEXT: retq
%tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 3
ret <4 x i32> %tmp1
}
define <2 x double> @t3(double %s, <2 x double> %tmp) nounwind {
; X32-LABEL: t3:
-; X32: movhpd
-; X32: ret
-
+; X32: # BB#0:
+; X32-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X32-NEXT: retl
+;
; X64-LABEL: t3:
-; X64: unpcklpd
-; X64: ret
-
+; X64: # BB#0:
+; X64-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-NEXT: movapd %xmm1, %xmm0
+; X64-NEXT: retq
%tmp1 = insertelement <2 x double> %tmp, double %s, i32 1
ret <2 x double> %tmp1
}
define <8 x i16> @t4(i16 %s, <8 x i16> %tmp) nounwind {
; X32-LABEL: t4:
-; X32: pinsrw
-; X32: ret
-
+; X32: # BB#0:
+; X32-NEXT: pinsrw $5, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t4:
+; X64: # BB#0:
+; X64-NEXT: pinsrw $5, %edi, %xmm0
+; X64-NEXT: retq
%tmp1 = insertelement <8 x i16> %tmp, i16 %s, i32 5
ret <8 x i16> %tmp1
}
diff --git a/test/CodeGen/X86/vec_insert-3.ll b/test/CodeGen/X86/vec_insert-3.ll
index 75244ae0b71a..57a265a0ce30 100644
--- a/test/CodeGen/X86/vec_insert-3.ll
+++ b/test/CodeGen/X86/vec_insert-3.ll
@@ -1,10 +1,23 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,-sse4.1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X64
define <2 x i64> @t1(i64 %s, <2 x i64> %tmp) nounwind {
-; CHECK-LABEL: t1:
-; CHECK: punpcklqdq
-; CHECK-NEXT: retq
-
+; X32-LABEL: t1:
+; X32: # BB#0:
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0:
+; X64-NEXT: movd %rdi, %xmm1
+; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: retq
%tmp1 = insertelement <2 x i64> %tmp, i64 %s, i32 1
ret <2 x i64> %tmp1
}
diff --git a/test/CodeGen/X86/vec_insert-4.ll b/test/CodeGen/X86/vec_insert-4.ll
index 2c31e56b4af6..c847ac983003 100644
--- a/test/CodeGen/X86/vec_insert-4.ll
+++ b/test/CodeGen/X86/vec_insert-4.ll
@@ -1,11 +1,40 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | grep 1084227584 | count 1
-
-; ModuleID = '<stdin>'
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-target triple = "i686-apple-darwin9.2.2"
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin9.2.2 -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9.2.2 -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X64
define <8 x float> @f(<8 x float> %a, i32 %b) nounwind {
+; X32-LABEL: f:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-32, %esp
+; X32-NEXT: subl $64, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT: movaps %xmm0, (%esp)
+; X32-NEXT: movl $1084227584, (%esp,%eax,4) ## imm = 0x40A00000
+; X32-NEXT: movaps (%esp), %xmm0
+; X32-NEXT: movaps {{[0-9]+}}(%esp), %xmm1
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: f:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: pushq %rbp
+; X64-NEXT: movq %rsp, %rbp
+; X64-NEXT: andq $-32, %rsp
+; X64-NEXT: subq $64, %rsp
+; X64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-NEXT: movaps %xmm0, (%rsp)
+; X64-NEXT: movslq %edi, %rax
+; X64-NEXT: movl $1084227584, (%rsp,%rax,4) ## imm = 0x40A00000
+; X64-NEXT: movaps (%rsp), %xmm0
+; X64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; X64-NEXT: movq %rbp, %rsp
+; X64-NEXT: popq %rbp
+; X64-NEXT: retq
entry:
- %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b ; <<4 x float>> [#uses=1]
- ret <8 x float> %vecins
+ %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
+ ret <8 x float> %vecins
}
diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll
index 14b57e76dc8f..67875b3ef23e 100644
--- a/test/CodeGen/X86/vec_insert-5.ll
+++ b/test/CodeGen/X86/vec_insert-5.ll
@@ -1,17 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=x86 -mattr=+sse2,+ssse3 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+ssse3 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3 | FileCheck %s --check-prefix=X64
+
; There are no MMX operations in @t1
define void @t1(i32 %a, x86_mmx* %P) nounwind {
-; CHECK-LABEL: t1:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: shll $12, %ecx
-; CHECK-NEXT: movd %ecx, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; CHECK-NEXT: movq %xmm0, (%eax)
-; CHECK-NEXT: retl
+; X32-LABEL: t1:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: shll $12, %ecx
+; X32-NEXT: movd %ecx, %xmm0
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; X32-NEXT: movq %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: shll $12, %edi
+; X64-NEXT: movd %rdi, %xmm0
+; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: movq %xmm0, (%rsi)
+; X64-NEXT: retq
%tmp12 = shl i32 %a, 12
%tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
%tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
@@ -21,87 +33,135 @@ define void @t1(i32 %a, x86_mmx* %P) nounwind {
}
define <4 x float> @t2(<4 x float>* %P) nounwind {
-; CHECK-LABEL: t2:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movaps (%eax), %xmm1
-; CHECK-NEXT: xorps %xmm0, %xmm0
-; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
-; CHECK-NEXT: retl
+; X32-LABEL: t2:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps (%eax), %xmm1
+; X32-NEXT: xorps %xmm0, %xmm0
+; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: t2:
+; X64: # BB#0:
+; X64-NEXT: movaps (%rdi), %xmm1
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X64-NEXT: retq
%tmp1 = load <4 x float>, <4 x float>* %P
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
ret <4 x float> %tmp2
}
define <4 x float> @t3(<4 x float>* %P) nounwind {
-; CHECK-LABEL: t3:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movapd (%eax), %xmm0
-; CHECK-NEXT: xorpd %xmm1, %xmm1
-; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; CHECK-NEXT: retl
+; X32-LABEL: t3:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movapd (%eax), %xmm0
+; X32-NEXT: xorpd %xmm1, %xmm1
+; X32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: t3:
+; X64: # BB#0:
+; X64-NEXT: movapd (%rdi), %xmm0
+; X64-NEXT: xorpd %xmm1, %xmm1
+; X64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X64-NEXT: retq
%tmp1 = load <4 x float>, <4 x float>* %P
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 >
ret <4 x float> %tmp2
}
define <4 x float> @t4(<4 x float>* %P) nounwind {
-; CHECK-LABEL: t4:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movaps (%eax), %xmm0
-; CHECK-NEXT: xorps %xmm1, %xmm1
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
-; CHECK-NEXT: retl
+; X32-LABEL: t4:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movaps (%eax), %xmm0
+; X32-NEXT: xorps %xmm1, %xmm1
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: t4:
+; X64: # BB#0:
+; X64-NEXT: movaps (%rdi), %xmm0
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; X64-NEXT: retq
%tmp1 = load <4 x float>, <4 x float>* %P
%tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
ret <4 x float> %tmp2
}
define <16 x i8> @t5(<16 x i8> %x) nounwind {
-; CHECK-LABEL: t5:
-; CHECK: # BB#0:
-; CHECK-NEXT: psrlw $8, %xmm0
-; CHECK-NEXT: retl
+; X32-LABEL: t5:
+; X32: # BB#0:
+; X32-NEXT: psrlw $8, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t5:
+; X64: # BB#0:
+; X64-NEXT: psrlw $8, %xmm0
+; X64-NEXT: retq
%s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17>
ret <16 x i8> %s
}
define <16 x i8> @t6(<16 x i8> %x) nounwind {
-; CHECK-LABEL: t6:
-; CHECK: # BB#0:
-; CHECK-NEXT: psrlw $8, %xmm0
-; CHECK-NEXT: retl
+; X32-LABEL: t6:
+; X32: # BB#0:
+; X32-NEXT: psrlw $8, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t6:
+; X64: # BB#0:
+; X64-NEXT: psrlw $8, %xmm0
+; X64-NEXT: retq
%s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i8> %s
}
define <16 x i8> @t7(<16 x i8> %x) nounwind {
-; CHECK-LABEL: t7:
-; CHECK: # BB#0:
-; CHECK-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; CHECK-NEXT: retl
+; X32-LABEL: t7:
+; X32: # BB#0:
+; X32-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
+; X32-NEXT: retl
+;
+; X64-LABEL: t7:
+; X64: # BB#0:
+; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
+; X64-NEXT: retq
%s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2>
ret <16 x i8> %s
}
define <16 x i8> @t8(<16 x i8> %x) nounwind {
-; CHECK-LABEL: t8:
-; CHECK: # BB#0:
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
-; CHECK-NEXT: retl
+; X32-LABEL: t8:
+; X32: # BB#0:
+; X32-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: t8:
+; X64: # BB#0:
+; X64-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; X64-NEXT: retq
%s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17>
ret <16 x i8> %s
}
define <16 x i8> @t9(<16 x i8> %x) nounwind {
-; CHECK-LABEL: t9:
-; CHECK: # BB#0:
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
-; CHECK-NEXT: retl
+; X32-LABEL: t9:
+; X32: # BB#0:
+; X32-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: t9:
+; X64: # BB#0:
+; X64-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; X64-NEXT: retq
%s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 14, i32 undef, i32 undef>
ret <16 x i8> %s
}
diff --git a/test/CodeGen/X86/vec_insert-7.ll b/test/CodeGen/X86/vec_insert-7.ll
index 4f72c66ecba2..02db6e6d8751 100644
--- a/test/CodeGen/X86/vec_insert-7.ll
+++ b/test/CodeGen/X86/vec_insert-7.ll
@@ -1,29 +1,38 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=x86 -mattr=+mmx,+sse4.2 -mtriple=i686-apple-darwin9 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=+mmx,+sse4.2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=+mmx,+sse4.2 | FileCheck %s --check-prefix=X64
; MMX insertelement is not available; these are promoted to XMM.
; (Without SSE they are split to two ints, and the code is much better.)
define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
-; CHECK-LABEL: mmx_movzl:
-; CHECK: ## BB#0:
-; CHECK-NEXT: subl $20, %esp
-; CHECK-NEXT: movq %mm0, {{[0-9]+}}(%esp)
-; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: movl $32, %eax
-; CHECK-NEXT: pinsrd $0, %eax, %xmm0
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: pinsrd $1, %eax, %xmm0
-; CHECK-NEXT: pinsrd $2, %eax, %xmm0
-; CHECK-NEXT: pinsrd $3, %eax, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT: movq %xmm0, (%esp)
-; CHECK-NEXT: movq (%esp), %mm0
-; CHECK-NEXT: addl $20, %esp
-; CHECK-NEXT: retl
+; X32-LABEL: mmx_movzl:
+; X32: ## BB#0:
+; X32-NEXT: subl $20, %esp
+; X32-NEXT: movq %mm0, {{[0-9]+}}(%esp)
+; X32-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; X32-NEXT: movl $32, %eax
+; X32-NEXT: pinsrd $0, %eax, %xmm0
+; X32-NEXT: pxor %xmm1, %xmm1
+; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; X32-NEXT: movq %xmm1, (%esp)
+; X32-NEXT: movq (%esp), %mm0
+; X32-NEXT: addl $20, %esp
+; X32-NEXT: retl
+;
+; X64-LABEL: mmx_movzl:
+; X64: ## BB#0:
+; X64-NEXT: movdq2q %xmm0, %mm0
+; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: pinsrq $0, %rax, %xmm1
+; X64-NEXT: pxor %xmm0, %xmm0
+; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; X64-NEXT: retq
%tmp = bitcast x86_mmx %x to <2 x i32>
- %tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0 ; <<2 x i32>> [#uses=1]
- %tmp8 = insertelement <2 x i32> %tmp3, i32 0, i32 1 ; <<2 x i32>> [#uses=1]
+ %tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0
+ %tmp8 = insertelement <2 x i32> %tmp3, i32 0, i32 1
%tmp9 = bitcast <2 x i32> %tmp8 to x86_mmx
ret x86_mmx %tmp9
}
diff --git a/test/CodeGen/X86/vec_insert-8.ll b/test/CodeGen/X86/vec_insert-8.ll
index 917832c40adb..d612e7eb10d3 100644
--- a/test/CodeGen/X86/vec_insert-8.ll
+++ b/test/CodeGen/X86/vec_insert-8.ll
@@ -1,15 +1,58 @@
-; RUN: llc < %s -march=x86 -mattr=+sse4.1 -o %t
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64
; tests variable insert and extract of a 4 x i32
-define <4 x i32> @var_insert(<4 x i32> %x, i32 %val, i32 %idx) nounwind {
+define <4 x i32> @var_insert(<4 x i32> %x, i32 %val, i32 %idx) nounwind {
+; X32-LABEL: var_insert:
+; X32: # BB#0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 12(%ebp), %ecx
+; X32-NEXT: movaps %xmm0, (%esp)
+; X32-NEXT: movl %eax, (%esp,%ecx,4)
+; X32-NEXT: movaps (%esp), %xmm0
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: var_insert:
+; X64: # BB#0: # %entry
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movslq %esi, %rax
+; X64-NEXT: movl %edi, -24(%rsp,%rax,4)
+; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT: retq
entry:
- %tmp3 = insertelement <4 x i32> %x, i32 %val, i32 %idx ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %tmp3
+ %tmp3 = insertelement <4 x i32> %x, i32 %val, i32 %idx
+ ret <4 x i32> %tmp3
}
-define i32 @var_extract(<4 x i32> %x, i32 %idx) nounwind {
+define i32 @var_extract(<4 x i32> %x, i32 %idx) nounwind {
+; X32-LABEL: var_extract:
+; X32: # BB#0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $32, %esp
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movaps %xmm0, (%esp)
+; X32-NEXT: movl (%esp,%eax,4), %eax
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: var_extract:
+; X64: # BB#0: # %entry
+; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movslq %edi, %rax
+; X64-NEXT: movl -24(%rsp,%rax,4), %eax
+; X64-NEXT: retq
entry:
- %tmp3 = extractelement <4 x i32> %x, i32 %idx ; <<i32>> [#uses=1]
- ret i32 %tmp3
+ %tmp3 = extractelement <4 x i32> %x, i32 %idx
+ ret i32 %tmp3
}
diff --git a/test/CodeGen/X86/vec_insert-9.ll b/test/CodeGen/X86/vec_insert-9.ll
index 5f2e676ef1ae..ec4a0288e107 100644
--- a/test/CodeGen/X86/vec_insert-9.ll
+++ b/test/CodeGen/X86/vec_insert-9.ll
@@ -1,9 +1,21 @@
-; RUN: llc < %s -march=x86 -mattr=+sse4.1 > %t
-; RUN: grep pinsrd %t | count 1
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64
define <4 x i32> @var_insert2(<4 x i32> %x, i32 %val, i32 %idx) nounwind {
+; X32-LABEL: var_insert2:
+; X32: # BB#0: # %entry
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: var_insert2:
+; X64: # BB#0: # %entry
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: pinsrd $3, %esi, %xmm0
+; X64-NEXT: retq
entry:
- %tmp3 = insertelement <4 x i32> undef, i32 %val, i32 0 ; <<4 x i32>> [#uses=1]
- %tmp4 = insertelement <4 x i32> %tmp3, i32 %idx, i32 3 ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %tmp4
+ %tmp3 = insertelement <4 x i32> undef, i32 %val, i32 0
+ %tmp4 = insertelement <4 x i32> %tmp3, i32 %idx, i32 3
+ ret <4 x i32> %tmp4
}
diff --git a/test/CodeGen/X86/vec_insert-mmx.ll b/test/CodeGen/X86/vec_insert-mmx.ll
index cbd420885ac1..2aae35591ab2 100644
--- a/test/CodeGen/X86/vec_insert-mmx.ll
+++ b/test/CodeGen/X86/vec_insert-mmx.ll
@@ -1,37 +1,56 @@
-; RUN: llc < %s -mtriple=i686-darwin -mattr=+mmx,+sse2 | FileCheck %s -check-prefix=X86-32
-; RUN: llc < %s -mtriple=x86_64-darwin -mattr=+mmx,+sse4.1 | FileCheck %s -check-prefix=X86-64
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-darwin -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-darwin -mattr=+mmx,+sse4.1 | FileCheck %s --check-prefix=X64
; This is not an MMX operation; promoted to XMM.
define x86_mmx @t0(i32 %A) nounwind {
-; X86-32-LABEL: t0:
-; X86-32: ## BB#0:
-; X86-32: movd {{[0-9]+}}(%esp), %xmm0
-; X86-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; X86-32-NEXT: movq %xmm0, (%esp)
-; X86-32-NEXT: movq (%esp), %mm0
-; X86-32-NEXT: addl $12, %esp
-; X86-32-NEXT: retl
+; X32-LABEL: t0:
+; X32: ## BB#0:
+; X32-NEXT: subl $12, %esp
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; X32-NEXT: movq %xmm0, (%esp)
+; X32-NEXT: movq (%esp), %mm0
+; X32-NEXT: addl $12, %esp
+; X32-NEXT: retl
+;
+; X64-LABEL: t0:
+; X64: ## BB#0:
+; X64-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: movd %rdi, %xmm0
+; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: retq
%tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1
%tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx
ret x86_mmx %tmp4
}
define <8 x i8> @t1(i8 zeroext %x) nounwind {
-; X86-32-LABEL: t1:
-; X86-32: ## BB#0:
-; X86-32-NOT: movl
-; X86-32-NEXT: movd {{[0-9]+}}(%esp), %xmm0
-; X86-32-NEXT: retl
+; X32-LABEL: t1:
+; X32: ## BB#0:
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: ## BB#0:
+; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: retq
%r = insertelement <8 x i8> undef, i8 %x, i32 0
ret <8 x i8> %r
}
; PR2574
define <2 x float> @t2(<2 x float> %a0) {
-; X86-32-LABEL: t2:
-; X86-32: ## BB#0:
-; X86-32-NEXT: xorps %xmm0, %xmm0
-; X86-32-NEXT: retl
+; X32-LABEL: t2:
+; X32: ## BB#0:
+; X32-NEXT: xorps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t2:
+; X64: ## BB#0:
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: retq
%v1 = insertelement <2 x float> %a0, float 0.000000e+00, i32 0
%v2 = insertelement <2 x float> %v1, float 0.000000e+00, i32 1
ret <2 x float> %v2
@@ -42,14 +61,31 @@ define <2 x float> @t2(<2 x float> %a0) {
; PR2562
define void @t3() {
-; X86-64-LABEL: t3:
-; X86-64: ## BB#0:
-; X86-64: pmovzxwd (%rcx)
-; X86-64-NEXT: movzwl
-; X86-64-NEXT: pinsrd $0
-; X86-64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; X86-64-NEXT: movq %xmm0
-; X86-64-NEXT: retq
+; X32-LABEL: t3:
+; X32: ## BB#0:
+; X32-NEXT: movl L_g0$non_lazy_ptr, %eax
+; X32-NEXT: movl L_g1$non_lazy_ptr, %ecx
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X32-NEXT: movzwl (%eax), %eax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-NEXT: movq %xmm0, (%ecx)
+; X32-NEXT: retl
+;
+; X64-LABEL: t3:
+; X64: ## BB#0:
+; X64-NEXT: movq _g0@{{.*}}(%rip), %rax
+; X64-NEXT: movq _g1@{{.*}}(%rip), %rcx
+; X64-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-NEXT: movzwl (%rax), %eax
+; X64-NEXT: pinsrd $0, %eax, %xmm0
+; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; X64-NEXT: movq %xmm0, (%rcx)
+; X64-NEXT: retq
load i16, i16* @g0
load <4 x i16>, <4 x i16>* @g1
insertelement <4 x i16> %2, i16 %1, i32 0
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index fd98791815e7..43f5318a6070 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -61,6 +61,7 @@ define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
; AVX-LABEL: sitofp_4i32_to_2f64:
; AVX: # BB#0:
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT: # kill
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%cvt = sitofp <4 x i32> %a to <4 x double>
@@ -98,6 +99,7 @@ define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -105,6 +107,7 @@ define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cvt = sitofp <8 x i16> %a to <8 x double>
@@ -144,6 +147,7 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
; AVX1: # BB#0:
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -152,6 +156,7 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cvt = sitofp <16 x i8> %a to <16 x double>
@@ -432,6 +437,7 @@ define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -445,6 +451,7 @@ define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cvt = uitofp <4 x i32> %a to <4 x double>
@@ -482,6 +489,7 @@ define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -489,6 +497,7 @@ define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cvt = uitofp <8 x i16> %a to <8 x double>
@@ -528,6 +537,7 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -536,6 +546,7 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cvt = uitofp <16 x i8> %a to <16 x double>
@@ -890,6 +901,7 @@ define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -897,6 +909,7 @@ define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cvt = sitofp <8 x i16> %a to <8 x float>
@@ -939,6 +952,7 @@ define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -947,6 +961,7 @@ define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cvt = sitofp <16 x i8> %a to <16 x float>
@@ -1085,9 +1100,7 @@ define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
;
; AVX2-LABEL: sitofp_8i8_to_8f32:
; AVX2: # BB#0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpslld $24, %ymm0, %ymm0
-; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
%shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1386,6 +1399,7 @@ define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1393,6 +1407,7 @@ define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cvt = uitofp <8 x i16> %a to <8 x float>
@@ -1430,12 +1445,12 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
;
; AVX1-LABEL: uitofp_16i8_to_4f32:
; AVX1: # BB#0:
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: # kill
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1444,6 +1459,7 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT: # kill
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%cvt = uitofp <16 x i8> %a to <16 x float>
@@ -1583,6 +1599,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX1-NEXT: .LBB45_10:
; AVX1-NEXT: shrq %rax
; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
@@ -1650,6 +1667,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX2-NEXT: .LBB45_10:
; AVX2-NEXT: shrq %rax
; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
@@ -1754,20 +1772,16 @@ define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
;
; AVX1-LABEL: uitofp_8i8_to_8f32:
; AVX1: # BB#0:
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i8_to_8f32:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
%shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1790,11 +1804,10 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
;
; AVX1-LABEL: uitofp_16i8_to_8f32:
; AVX1: # BB#0:
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -1810,6 +1823,1654 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
}
;
+; Load Signed Integer to Double
+;
+
+define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
+; SSE-LABEL: sitofp_load_2i64_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: cvtsi2sdq %rax, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2sdq %rax, %xmm1
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_2i64_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %ld = load <2 x i64>, <2 x i64> *%a
+ %cvt = sitofp <2 x i64> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <2 x double> @sitofp_load_2i32_to_2f64(<2 x i32> *%a) {
+; SSE-LABEL: sitofp_load_2i32_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: cvtdq2pd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_2i32_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0
+; AVX-NEXT: retq
+ %ld = load <2 x i32>, <2 x i32> *%a
+ %cvt = sitofp <2 x i32> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) {
+; SSE-LABEL: sitofp_load_2i16_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_2i16_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovsxwq (%rdi), %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <2 x i16>, <2 x i16> *%a
+ %cvt = sitofp <2 x i16> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) {
+; SSE-LABEL: sitofp_load_2i8_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movzwl (%rdi), %eax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT: psrad $24, %xmm0
+; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_2i8_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovsxbq (%rdi), %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <2 x i8>, <2 x i8> *%a
+ %cvt = sitofp <2 x i8> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
+; SSE-LABEL: sitofp_load_4i64_to_4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: cvtsi2sdq %rax, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2sdq %rax, %xmm1
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2sdq %rax, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: cvtsi2sdq %rax, %xmm2
+; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: sitofp_load_4i64_to_4f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrq $1, %xmm1, %rax
+; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
+; AVX1-NEXT: vmovq %xmm1, %rax
+; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sitofp_load_4i64_to_4f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <4 x i64>, <4 x i64> *%a
+ %cvt = sitofp <4 x i64> %ld to <4 x double>
+ ret <4 x double> %cvt
+}
+
+define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) {
+; SSE-LABEL: sitofp_load_4i32_to_4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_4i32_to_4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtdq2pd (%rdi), %ymm0
+; AVX-NEXT: retq
+ %ld = load <4 x i32>, <4 x i32> *%a
+ %cvt = sitofp <4 x i32> %ld to <4 x double>
+ ret <4 x double> %cvt
+}
+
+define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) {
+; SSE-LABEL: sitofp_load_4i16_to_4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_4i16_to_4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovsxwd (%rdi), %xmm0
+; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT: retq
+ %ld = load <4 x i16>, <4 x i16> *%a
+ %cvt = sitofp <4 x i16> %ld to <4 x double>
+ ret <4 x double> %cvt
+}
+
+define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
+; SSE-LABEL: sitofp_load_4i8_to_4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: psrad $24, %xmm1
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_4i8_to_4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
+; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT: retq
+ %ld = load <4 x i8>, <4 x i8> *%a
+ %cvt = sitofp <4 x i8> %ld to <4 x double>
+ ret <4 x double> %cvt
+}
+
+;
+; Load Unsigned Integer to Double
+;
+
+define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) {
+; SSE-LABEL: uitofp_load_2i64_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
+; SSE-NEXT: subpd %xmm4, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-NEXT: subpd %xmm4, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uitofp_load_2i64_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
+; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX-NEXT: retq
+ %ld = load <2 x i64>, <2 x i64> *%a
+ %cvt = uitofp <2 x i64> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
+; SSE-LABEL: uitofp_load_2i32_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
+; SSE-NEXT: subpd %xmm4, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-NEXT: subpd %xmm4, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uitofp_load_2i32_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
+; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX-NEXT: retq
+ %ld = load <2 x i32>, <2 x i32> *%a
+ %cvt = uitofp <2 x i32> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) {
+; SSE-LABEL: uitofp_load_2i16_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uitofp_load_2i16_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <2 x i16>, <2 x i16> *%a
+ %cvt = uitofp <2 x i16> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) {
+; SSE-LABEL: uitofp_load_2i8_to_2f64:
+; SSE: # BB#0:
+; SSE-NEXT: movzwl (%rdi), %eax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uitofp_load_2i8_to_2f64:
+; AVX: # BB#0:
+; AVX-NEXT: movzwl (%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <2 x i8>, <2 x i8> *%a
+ %cvt = uitofp <2 x i8> %ld to <2 x double>
+ ret <2 x double> %cvt
+}
+
+define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
+; SSE-LABEL: uitofp_load_4i64_to_4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT: movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25]
+; SSE-NEXT: subpd %xmm5, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE-NEXT: subpd %xmm5, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,0,1]
+; SSE-NEXT: addpd %xmm4, %xmm1
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: subpd %xmm5, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE-NEXT: addpd %xmm2, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE-NEXT: subpd %xmm5, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
+; SSE-NEXT: addpd %xmm4, %xmm2
+; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: uitofp_load_4i64_to_4f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
+; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_load_4i64_to_4f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX2-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
+; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX2-NEXT: vsubpd %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-NEXT: vsubpd %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <4 x i64>, <4 x i64> *%a
+ %cvt = uitofp <4 x i64> %ld to <4 x double>
+ ret <4 x double> %cvt
+}
+
+define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
+; SSE-LABEL: uitofp_load_4i32_to_4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm2
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1127219200,1160773632,0,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE-NEXT: movapd {{.*#+}} xmm6 = [4.503600e+15,1.934281e+25]
+; SSE-NEXT: subpd %xmm6, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE-NEXT: addpd %xmm3, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-NEXT: subpd %xmm6, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1]
+; SSE-NEXT: addpd %xmm5, %xmm3
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE-NEXT: subpd %xmm6, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
+; SSE-NEXT: addpd %xmm2, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE-NEXT: subpd %xmm6, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
+; SSE-NEXT: addpd %xmm3, %xmm2
+; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: uitofp_load_4i32_to_4f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_load_4i32_to_4f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
+; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <4 x i32>, <4 x i32> *%a
+ %cvt = uitofp <4 x i32> %ld to <4 x double>
+ ret <4 x double> %cvt
+}
+
+define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) {
+; SSE-LABEL: uitofp_load_4i16_to_4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uitofp_load_4i16_to_4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT: retq
+ %ld = load <4 x i16>, <4 x i16> *%a
+ %cvt = uitofp <4 x i16> %ld to <4 x double>
+ ret <4 x double> %cvt
+}
+
+define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) {
+; SSE-LABEL: uitofp_load_4i8_to_4f64:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uitofp_load_4i8_to_4f64:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT: retq
+ %ld = load <4 x i8>, <4 x i8> *%a
+ %cvt = uitofp <4 x i8> %ld to <4 x double>
+ ret <4 x double> %cvt
+}
+
+;
+; Load Signed Integer to Float
+;
+
+define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
+; SSE-LABEL: sitofp_load_4i64_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
+; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: cvtsi2ssq %rax, %xmm3
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: sitofp_load_4i64_to_4f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sitofp_load_4i64_to_4f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %ld = load <4 x i64>, <4 x i64> *%a
+ %cvt = sitofp <4 x i64> %ld to <4 x float>
+ ret <4 x float> %cvt
+}
+
+define <4 x float> @sitofp_load_4i32_to_4f32(<4 x i32> *%a) {
+; SSE-LABEL: sitofp_load_4i32_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: cvtdq2ps (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_4i32_to_4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtdq2ps (%rdi), %xmm0
+; AVX-NEXT: retq
+ %ld = load <4 x i32>, <4 x i32> *%a
+ %cvt = sitofp <4 x i32> %ld to <4 x float>
+ ret <4 x float> %cvt
+}
+
+define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) {
+; SSE-LABEL: sitofp_load_4i16_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_4i16_to_4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovsxwd (%rdi), %xmm0
+; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <4 x i16>, <4 x i16> *%a
+ %cvt = sitofp <4 x i16> %ld to <4 x float>
+ ret <4 x float> %cvt
+}
+
+define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) {
+; SSE-LABEL: sitofp_load_4i8_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT: psrad $24, %xmm0
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_4i8_to_4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
+; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <4 x i8>, <4 x i8> *%a
+ %cvt = sitofp <4 x i8> %ld to <4 x float>
+ ret <4 x float> %cvt
+}
+
+define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
+; SSE-LABEL: sitofp_load_8i64_to_8f32:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
+; SSE-NEXT: movdqa 32(%rdi), %xmm3
+; SSE-NEXT: movdqa 48(%rdi), %xmm4
+; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: cvtsi2ssq %rax, %xmm5
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movd %xmm4, %rax
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: movd %xmm3, %rax
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
+; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE-NEXT: movd %xmm3, %rax
+; SSE-NEXT: xorps %xmm3, %xmm3
+; SSE-NEXT: cvtsi2ssq %rax, %xmm3
+; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: sitofp_load_8i64_to_8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX1-NEXT: vpextrq $1, %xmm1, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX1-NEXT: vmovq %xmm1, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vmovq %xmm1, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; AVX1-NEXT: vpextrq $1, %xmm1, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sitofp_load_8i64_to_8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; AVX2-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <8 x i64>, <8 x i64> *%a
+ %cvt = sitofp <8 x i64> %ld to <8 x float>
+ ret <8 x float> %cvt
+}
+
+define <8 x float> @sitofp_load_8i32_to_8f32(<8 x i32> *%a) {
+; SSE-LABEL: sitofp_load_8i32_to_8f32:
+; SSE: # BB#0:
+; SSE-NEXT: cvtdq2ps (%rdi), %xmm0
+; SSE-NEXT: cvtdq2ps 16(%rdi), %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sitofp_load_8i32_to_8f32:
+; AVX: # BB#0:
+; AVX-NEXT: vcvtdq2ps (%rdi), %ymm0
+; AVX-NEXT: retq
+ %ld = load <8 x i32>, <8 x i32> *%a
+ %cvt = sitofp <8 x i32> %ld to <8 x float>
+ ret <8 x float> %cvt
+}
+
+define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) {
+; SSE-LABEL: sitofp_load_8i16_to_8f32:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: sitofp_load_8i16_to_8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0
+; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sitofp_load_8i16_to_8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0
+; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <8 x i16>, <8 x i16> *%a
+ %cvt = sitofp <8 x i16> %ld to <8 x float>
+ ret <8 x float> %cvt
+}
+
+define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) {
+; SSE-LABEL: sitofp_load_8i8_to_8f32:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT: psrad $24, %xmm0
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSE-NEXT: psrad $24, %xmm1
+; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: sitofp_load_8i8_to_8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sitofp_load_8i8_to_8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0
+; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <8 x i8>, <8 x i8> *%a
+ %cvt = sitofp <8 x i8> %ld to <8 x float>
+ ret <8 x float> %cvt
+}
+
+;
+; Load Unsigned Integer to Float
+;
+
+define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
+; SSE-LABEL: uitofp_load_4i64_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm3
+; SSE-NEXT: movd %xmm3, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB74_1
+; SSE-NEXT: # BB#2:
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: jmp .LBB74_3
+; SSE-NEXT: .LBB74_1:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm2
+; SSE-NEXT: addss %xmm2, %xmm2
+; SSE-NEXT: .LBB74_3:
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB74_4
+; SSE-NEXT: # BB#5:
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE-NEXT: jmp .LBB74_6
+; SSE-NEXT: .LBB74_4:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
+; SSE-NEXT: addss %xmm0, %xmm0
+; SSE-NEXT: .LBB74_6:
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE-NEXT: movd %xmm3, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB74_7
+; SSE-NEXT: # BB#8:
+; SSE-NEXT: xorps %xmm3, %xmm3
+; SSE-NEXT: cvtsi2ssq %rax, %xmm3
+; SSE-NEXT: jmp .LBB74_9
+; SSE-NEXT: .LBB74_7:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: xorps %xmm3, %xmm3
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm3
+; SSE-NEXT: addss %xmm3, %xmm3
+; SSE-NEXT: .LBB74_9:
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB74_10
+; SSE-NEXT: # BB#11:
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE-NEXT: jmp .LBB74_12
+; SSE-NEXT: .LBB74_10:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
+; SSE-NEXT: addss %xmm1, %xmm1
+; SSE-NEXT: .LBB74_12:
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: uitofp_load_4i64_to_4f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB74_1
+; AVX1-NEXT: # BB#2:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; AVX1-NEXT: jmp .LBB74_3
+; AVX1-NEXT: .LBB74_1:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
+; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: .LBB74_3:
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB74_4
+; AVX1-NEXT: # BB#5:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX1-NEXT: jmp .LBB74_6
+; AVX1-NEXT: .LBB74_4:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
+; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: .LBB74_6:
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB74_7
+; AVX1-NEXT: # BB#8:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX1-NEXT: jmp .LBB74_9
+; AVX1-NEXT: .LBB74_7:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
+; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: .LBB74_9:
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB74_10
+; AVX1-NEXT: # BB#11:
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+; AVX1-NEXT: .LBB74_10:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
+; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_load_4i64_to_4f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB74_1
+; AVX2-NEXT: # BB#2:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; AVX2-NEXT: jmp .LBB74_3
+; AVX2-NEXT: .LBB74_1:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
+; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: .LBB74_3:
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB74_4
+; AVX2-NEXT: # BB#5:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX2-NEXT: jmp .LBB74_6
+; AVX2-NEXT: .LBB74_4:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
+; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: .LBB74_6:
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB74_7
+; AVX2-NEXT: # BB#8:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX2-NEXT: jmp .LBB74_9
+; AVX2-NEXT: .LBB74_7:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
+; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: .LBB74_9:
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB74_10
+; AVX2-NEXT: # BB#11:
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+; AVX2-NEXT: .LBB74_10:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
+; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %ld = load <4 x i64>, <4 x i64> *%a
+ %cvt = uitofp <4 x i64> %ld to <4 x float>
+ ret <4 x float> %cvt
+}
+
+define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) {
+; SSE-LABEL: uitofp_load_4i32_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; SSE-NEXT: pand %xmm0, %xmm1
+; SSE-NEXT: por {{.*}}(%rip), %xmm1
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm0
+; SSE-NEXT: addps {{.*}}(%rip), %xmm0
+; SSE-NEXT: addps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: uitofp_load_4i32_to_4f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_load_4i32_to_4f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
+ %ld = load <4 x i32>, <4 x i32> *%a
+ %cvt = uitofp <4 x i32> %ld to <4 x float>
+ ret <4 x float> %cvt
+}
+
+define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) {
+; SSE-LABEL: uitofp_load_4i16_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uitofp_load_4i16_to_4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <4 x i16>, <4 x i16> *%a
+ %cvt = uitofp <4 x i16> %ld to <4 x float>
+ ret <4 x float> %cvt
+}
+
+define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) {
+; SSE-LABEL: uitofp_load_4i8_to_4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: uitofp_load_4i8_to_4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <4 x i8>, <4 x i8> *%a
+ %cvt = uitofp <4 x i8> %ld to <4 x float>
+ ret <4 x float> %cvt
+}
+
+define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
+; SSE-LABEL: uitofp_load_8i64_to_8f32:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm5
+; SSE-NEXT: movdqa 32(%rdi), %xmm2
+; SSE-NEXT: movdqa 48(%rdi), %xmm3
+; SSE-NEXT: movd %xmm5, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB78_1
+; SSE-NEXT: # BB#2:
+; SSE-NEXT: cvtsi2ssq %rax, %xmm4
+; SSE-NEXT: jmp .LBB78_3
+; SSE-NEXT: .LBB78_1:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm4
+; SSE-NEXT: addss %xmm4, %xmm4
+; SSE-NEXT: .LBB78_3:
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB78_4
+; SSE-NEXT: # BB#5:
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE-NEXT: jmp .LBB78_6
+; SSE-NEXT: .LBB78_4:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
+; SSE-NEXT: addss %xmm0, %xmm0
+; SSE-NEXT: .LBB78_6:
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; SSE-NEXT: movd %xmm5, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB78_7
+; SSE-NEXT: # BB#8:
+; SSE-NEXT: cvtsi2ssq %rax, %xmm6
+; SSE-NEXT: jmp .LBB78_9
+; SSE-NEXT: .LBB78_7:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm6
+; SSE-NEXT: addss %xmm6, %xmm6
+; SSE-NEXT: .LBB78_9:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB78_10
+; SSE-NEXT: # BB#11:
+; SSE-NEXT: xorps %xmm5, %xmm5
+; SSE-NEXT: cvtsi2ssq %rax, %xmm5
+; SSE-NEXT: jmp .LBB78_12
+; SSE-NEXT: .LBB78_10:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: xorps %xmm5, %xmm5
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm5
+; SSE-NEXT: addss %xmm5, %xmm5
+; SSE-NEXT: .LBB78_12:
+; SSE-NEXT: movd %xmm3, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB78_13
+; SSE-NEXT: # BB#14:
+; SSE-NEXT: cvtsi2ssq %rax, %xmm7
+; SSE-NEXT: jmp .LBB78_15
+; SSE-NEXT: .LBB78_13:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm7
+; SSE-NEXT: addss %xmm7, %xmm7
+; SSE-NEXT: .LBB78_15:
+; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB78_16
+; SSE-NEXT: # BB#17:
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE-NEXT: jmp .LBB78_18
+; SSE-NEXT: .LBB78_16:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
+; SSE-NEXT: addss %xmm1, %xmm1
+; SSE-NEXT: .LBB78_18:
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE-NEXT: movd %xmm3, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB78_19
+; SSE-NEXT: # BB#20:
+; SSE-NEXT: xorps %xmm3, %xmm3
+; SSE-NEXT: cvtsi2ssq %rax, %xmm3
+; SSE-NEXT: jmp .LBB78_21
+; SSE-NEXT: .LBB78_19:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: xorps %xmm3, %xmm3
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm3
+; SSE-NEXT: addss %xmm3, %xmm3
+; SSE-NEXT: .LBB78_21:
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $1, %ecx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: js .LBB78_22
+; SSE-NEXT: # BB#23:
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: jmp .LBB78_24
+; SSE-NEXT: .LBB78_22:
+; SSE-NEXT: shrq %rax
+; SSE-NEXT: orq %rax, %rcx
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: cvtsi2ssq %rcx, %xmm2
+; SSE-NEXT: addss %xmm2, %xmm2
+; SSE-NEXT: .LBB78_24:
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: uitofp_load_8i64_to_8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX1-NEXT: vpextrq $1, %xmm2, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB78_1
+; AVX1-NEXT: # BB#2:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; AVX1-NEXT: jmp .LBB78_3
+; AVX1-NEXT: .LBB78_1:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
+; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: .LBB78_3:
+; AVX1-NEXT: vmovq %xmm2, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB78_4
+; AVX1-NEXT: # BB#5:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX1-NEXT: jmp .LBB78_6
+; AVX1-NEXT: .LBB78_4:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
+; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: .LBB78_6:
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vmovq %xmm2, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB78_7
+; AVX1-NEXT: # BB#8:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm4
+; AVX1-NEXT: jmp .LBB78_9
+; AVX1-NEXT: .LBB78_7:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm4
+; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: .LBB78_9:
+; AVX1-NEXT: vpextrq $1, %xmm2, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB78_10
+; AVX1-NEXT: # BB#11:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX1-NEXT: jmp .LBB78_12
+; AVX1-NEXT: .LBB78_10:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
+; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: .LBB78_12:
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB78_13
+; AVX1-NEXT: # BB#14:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
+; AVX1-NEXT: jmp .LBB78_15
+; AVX1-NEXT: .LBB78_13:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm5
+; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: .LBB78_15:
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB78_16
+; AVX1-NEXT: # BB#17:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX1-NEXT: jmp .LBB78_18
+; AVX1-NEXT: .LBB78_16:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
+; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: .LBB78_18:
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vmovq %xmm4, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB78_19
+; AVX1-NEXT: # BB#20:
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
+; AVX1-NEXT: jmp .LBB78_21
+; AVX1-NEXT: .LBB78_19:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
+; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5
+; AVX1-NEXT: .LBB78_21:
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
+; AVX1-NEXT: vpextrq $1, %xmm4, %rax
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: andl $1, %ecx
+; AVX1-NEXT: testq %rax, %rax
+; AVX1-NEXT: js .LBB78_22
+; AVX1-NEXT: # BB#23:
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX1-NEXT: jmp .LBB78_24
+; AVX1-NEXT: .LBB78_22:
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
+; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: .LBB78_24:
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_load_8i64_to_8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB78_1
+; AVX2-NEXT: # BB#2:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
+; AVX2-NEXT: jmp .LBB78_3
+; AVX2-NEXT: .LBB78_1:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
+; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: .LBB78_3:
+; AVX2-NEXT: vmovq %xmm2, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB78_4
+; AVX2-NEXT: # BB#5:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX2-NEXT: jmp .LBB78_6
+; AVX2-NEXT: .LBB78_4:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
+; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: .LBB78_6:
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT: vmovq %xmm2, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB78_7
+; AVX2-NEXT: # BB#8:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm4
+; AVX2-NEXT: jmp .LBB78_9
+; AVX2-NEXT: .LBB78_7:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm4
+; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: .LBB78_9:
+; AVX2-NEXT: vpextrq $1, %xmm2, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB78_10
+; AVX2-NEXT: # BB#11:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX2-NEXT: jmp .LBB78_12
+; AVX2-NEXT: .LBB78_10:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
+; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: .LBB78_12:
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB78_13
+; AVX2-NEXT: # BB#14:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
+; AVX2-NEXT: jmp .LBB78_15
+; AVX2-NEXT: .LBB78_13:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm5
+; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5
+; AVX2-NEXT: .LBB78_15:
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB78_16
+; AVX2-NEXT: # BB#17:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
+; AVX2-NEXT: jmp .LBB78_18
+; AVX2-NEXT: .LBB78_16:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
+; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: .LBB78_18:
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vmovq %xmm4, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB78_19
+; AVX2-NEXT: # BB#20:
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
+; AVX2-NEXT: jmp .LBB78_21
+; AVX2-NEXT: .LBB78_19:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
+; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5
+; AVX2-NEXT: .LBB78_21:
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
+; AVX2-NEXT: vpextrq $1, %xmm4, %rax
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $1, %ecx
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: js .LBB78_22
+; AVX2-NEXT: # BB#23:
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
+; AVX2-NEXT: jmp .LBB78_24
+; AVX2-NEXT: .LBB78_22:
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
+; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: .LBB78_24:
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <8 x i64>, <8 x i64> *%a
+ %cvt = uitofp <8 x i64> %ld to <8 x float>
+ ret <8 x float> %cvt
+}
+
+define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) {
+; SSE-LABEL: uitofp_load_8i32_to_8f32:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm0
+; SSE-NEXT: movdqa 16(%rdi), %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pand %xmm2, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
+; SSE-NEXT: por %xmm4, %xmm3
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
+; SSE-NEXT: por %xmm5, %xmm0
+; SSE-NEXT: movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
+; SSE-NEXT: addps %xmm6, %xmm0
+; SSE-NEXT: addps %xmm3, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: por %xmm4, %xmm2
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: addps %xmm6, %xmm1
+; SSE-NEXT: addps %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: uitofp_load_8i32_to_8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_load_8i32_to_8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <8 x i32>, <8 x i32> *%a
+ %cvt = uitofp <8 x i32> %ld to <8 x float>
+ ret <8 x float> %cvt
+}
+
+define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) {
+; SSE-LABEL: uitofp_load_8i16_to_8f32:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: uitofp_load_8i16_to_8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_load_8i16_to_8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <8 x i16>, <8 x i16> *%a
+ %cvt = uitofp <8 x i16> %ld to <8 x float>
+ ret <8 x float> %cvt
+}
+
+define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) {
+; SSE-LABEL: uitofp_load_8i8_to_8f32:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: uitofp_load_8i8_to_8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: uitofp_load_8i8_to_8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %ld = load <8 x i8>, <8 x i8> *%a
+ %cvt = uitofp <8 x i8> %ld to <8 x float>
+ ret <8 x float> %cvt
+}
+
+;
; Aggregates
;
diff --git a/test/CodeGen/X86/vec_loadsingles.ll b/test/CodeGen/X86/vec_loadsingles.ll
index ecae5d962826..b0d95c5d00da 100644
--- a/test/CodeGen/X86/vec_loadsingles.ll
+++ b/test/CodeGen/X86/vec_loadsingles.ll
@@ -1,22 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,-slow-unaligned-mem-32 | FileCheck %s --check-prefix=ALL --check-prefix=FAST32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+slow-unaligned-mem-32 | FileCheck %s --check-prefix=ALL --check-prefix=SLOW32
define <4 x float> @merge_2_floats(float* nocapture %p) nounwind readonly {
+; ALL-LABEL: merge_2_floats:
+; ALL: # BB#0:
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: retq
%tmp1 = load float, float* %p
%vecins = insertelement <4 x float> undef, float %tmp1, i32 0
%add.ptr = getelementptr float, float* %p, i32 1
%tmp5 = load float, float* %add.ptr
%vecins7 = insertelement <4 x float> %vecins, float %tmp5, i32 1
ret <4 x float> %vecins7
-
-; ALL-LABEL: merge_2_floats
-; ALL: vmovq
-; ALL-NEXT: retq
}
; Test-case generated due to a crash when trying to treat loading the first
; two i64s of a <4 x i64> as a load of two i32s.
define <4 x i64> @merge_2_floats_into_4() {
+; ALL-LABEL: merge_2_floats_into_4:
+; ALL: # BB#0:
+; ALL-NEXT: movq (%rax), %rax
+; ALL-NEXT: vmovups (%rax), %xmm0
+; ALL-NEXT: retq
%1 = load i64*, i64** undef, align 8
%2 = getelementptr inbounds i64, i64* %1, i64 0
%3 = load i64, i64* %2
@@ -27,13 +33,13 @@ define <4 x i64> @merge_2_floats_into_4() {
%8 = insertelement <4 x i64> %4, i64 %7, i32 1
%9 = shufflevector <4 x i64> %8, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x i64> %9
-
-; ALL-LABEL: merge_2_floats_into_4
-; ALL: vmovups
-; ALL-NEXT: retq
}
define <4 x float> @merge_4_floats(float* %ptr) {
+; ALL-LABEL: merge_4_floats:
+; ALL: # BB#0:
+; ALL-NEXT: vmovups (%rdi), %xmm0
+; ALL-NEXT: retq
%a = load float, float* %ptr, align 8
%vec = insertelement <4 x float> undef, float %a, i32 0
%idx1 = getelementptr inbounds float, float* %ptr, i64 1
@@ -46,18 +52,24 @@ define <4 x float> @merge_4_floats(float* %ptr) {
%d = load float, float* %idx5, align 8
%vec6 = insertelement <4 x float> %vec4, float %d, i32 3
ret <4 x float> %vec6
-
-; ALL-LABEL: merge_4_floats
-; ALL: vmovups
-; ALL-NEXT: retq
}
-; PR21710 ( http://llvm.org/bugs/show_bug.cgi?id=21710 )
+; PR21710 ( http://llvm.org/bugs/show_bug.cgi?id=21710 )
; Make sure that 32-byte vectors are handled efficiently.
; If the target has slow 32-byte accesses, we should still generate
; 16-byte loads.
define <8 x float> @merge_8_floats(float* %ptr) {
+; FAST32-LABEL: merge_8_floats:
+; FAST32: # BB#0:
+; FAST32-NEXT: vmovups (%rdi), %ymm0
+; FAST32-NEXT: retq
+;
+; SLOW32-LABEL: merge_8_floats:
+; SLOW32: # BB#0:
+; SLOW32-NEXT: vmovups (%rdi), %xmm0
+; SLOW32-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
+; SLOW32-NEXT: retq
%a = load float, float* %ptr, align 4
%vec = insertelement <8 x float> undef, float %a, i32 0
%idx1 = getelementptr inbounds float, float* %ptr, i64 1
@@ -82,18 +94,19 @@ define <8 x float> @merge_8_floats(float* %ptr) {
%h = load float, float* %idx13, align 4
%vec14 = insertelement <8 x float> %vec12, float %h, i32 7
ret <8 x float> %vec14
-
-; ALL-LABEL: merge_8_floats
-
-; FAST32: vmovups
-; FAST32-NEXT: retq
-
-; SLOW32: vmovups
-; SLOW32-NEXT: vinsertf128
-; SLOW32-NEXT: retq
}
define <4 x double> @merge_4_doubles(double* %ptr) {
+; FAST32-LABEL: merge_4_doubles:
+; FAST32: # BB#0:
+; FAST32-NEXT: vmovups (%rdi), %ymm0
+; FAST32-NEXT: retq
+;
+; SLOW32-LABEL: merge_4_doubles:
+; SLOW32: # BB#0:
+; SLOW32-NEXT: vmovups (%rdi), %xmm0
+; SLOW32-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
+; SLOW32-NEXT: retq
%a = load double, double* %ptr, align 8
%vec = insertelement <4 x double> undef, double %a, i32 0
%idx1 = getelementptr inbounds double, double* %ptr, i64 1
@@ -106,20 +119,22 @@ define <4 x double> @merge_4_doubles(double* %ptr) {
%d = load double, double* %idx5, align 8
%vec6 = insertelement <4 x double> %vec4, double %d, i32 3
ret <4 x double> %vec6
-
-; ALL-LABEL: merge_4_doubles
-; FAST32: vmovups
-; FAST32-NEXT: retq
-
-; SLOW32: vmovups
-; SLOW32-NEXT: vinsertf128
-; SLOW32-NEXT: retq
}
-; PR21771 ( http://llvm.org/bugs/show_bug.cgi?id=21771 )
+; PR21771 ( http://llvm.org/bugs/show_bug.cgi?id=21771 )
; Recognize and combine consecutive loads even when the
; first of the combined loads is offset from the base address.
define <4 x double> @merge_4_doubles_offset(double* %ptr) {
+; FAST32-LABEL: merge_4_doubles_offset:
+; FAST32: # BB#0:
+; FAST32-NEXT: vmovups 32(%rdi), %ymm0
+; FAST32-NEXT: retq
+;
+; SLOW32-LABEL: merge_4_doubles_offset:
+; SLOW32: # BB#0:
+; SLOW32-NEXT: vmovups 32(%rdi), %xmm0
+; SLOW32-NEXT: vinsertf128 $1, 48(%rdi), %ymm0, %ymm0
+; SLOW32-NEXT: retq
%arrayidx4 = getelementptr inbounds double, double* %ptr, i64 4
%arrayidx5 = getelementptr inbounds double, double* %ptr, i64 5
%arrayidx6 = getelementptr inbounds double, double* %ptr, i64 6
@@ -133,13 +148,5 @@ define <4 x double> @merge_4_doubles_offset(double* %ptr) {
%vecinit6 = insertelement <4 x double> %vecinit5, double %g, i32 2
%vecinit7 = insertelement <4 x double> %vecinit6, double %h, i32 3
ret <4 x double> %vecinit7
-
-; ALL-LABEL: merge_4_doubles_offset
-; FAST32: vmovups
-; FAST32-NEXT: retq
-
-; SLOW32: vmovups
-; SLOW32-NEXT: vinsertf128
-; SLOW32-NEXT: retq
}
diff --git a/test/CodeGen/X86/vec_logical.ll b/test/CodeGen/X86/vec_logical.ll
index 6ab2d8963abd..b632616cde88 100644
--- a/test/CodeGen/X86/vec_logical.ll
+++ b/test/CodeGen/X86/vec_logical.ll
@@ -1,42 +1,87 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 > %t
-; RUN: grep xorps %t | count 2
-; RUN: grep andnps %t
-; RUN: grep movaps %t | count 2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
define void @t(<4 x float> %A) {
- %tmp1277 = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %A
- store <4 x float> %tmp1277, <4 x float>* null
- ret void
+; SSE-LABEL: t:
+; SSE: # BB#0:
+; SSE-NEXT: xorps .LCPI0_0, %xmm0
+; SSE-NEXT: movaps %xmm0, 0
+; SSE-NEXT: retl
+;
+; AVX-LABEL: t:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps .LCPI0_0, %xmm0, %xmm0
+; AVX-NEXT: vmovaps %xmm0, 0
+; AVX-NEXT: retl
+ %tmp1277 = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %A
+ store <4 x float> %tmp1277, <4 x float>* null
+ ret void
}
define <4 x float> @t1(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: t1:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: xorps %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; AVX-LABEL: t1:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
entry:
- %tmp9 = bitcast <4 x float> %a to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp10 = bitcast <4 x float> %b to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp11 = xor <4 x i32> %tmp9, %tmp10 ; <<4 x i32>> [#uses=1]
- %tmp13 = bitcast <4 x i32> %tmp11 to <4 x float> ; <<4 x float>> [#uses=1]
- ret <4 x float> %tmp13
+ %tmp9 = bitcast <4 x float> %a to <4 x i32>
+ %tmp10 = bitcast <4 x float> %b to <4 x i32>
+ %tmp11 = xor <4 x i32> %tmp9, %tmp10
+ %tmp13 = bitcast <4 x i32> %tmp11 to <4 x float>
+ ret <4 x float> %tmp13
}
define <2 x double> @t2(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: t2:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: andps %xmm1, %xmm0
+; SSE-NEXT: retl
+;
+; AVX-LABEL: t2:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retl
entry:
- %tmp9 = bitcast <2 x double> %a to <2 x i64> ; <<2 x i64>> [#uses=1]
- %tmp10 = bitcast <2 x double> %b to <2 x i64> ; <<2 x i64>> [#uses=1]
- %tmp11 = and <2 x i64> %tmp9, %tmp10 ; <<2 x i64>> [#uses=1]
- %tmp13 = bitcast <2 x i64> %tmp11 to <2 x double> ; <<2 x double>> [#uses=1]
- ret <2 x double> %tmp13
+ %tmp9 = bitcast <2 x double> %a to <2 x i64>
+ %tmp10 = bitcast <2 x double> %b to <2 x i64>
+ %tmp11 = and <2 x i64> %tmp9, %tmp10
+ %tmp13 = bitcast <2 x i64> %tmp11 to <2 x double>
+ ret <2 x double> %tmp13
}
define void @t3(<4 x float> %a, <4 x float> %b, <4 x float>* %c, <4 x float>* %d) {
+; SSE-LABEL: t3:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SSE-NEXT: andnps %xmm1, %xmm0
+; SSE-NEXT: orps (%ecx), %xmm0
+; SSE-NEXT: movaps %xmm0, (%eax)
+; SSE-NEXT: retl
+;
+; AVX-LABEL: t3:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX-NEXT: vandnps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vorps (%ecx), %xmm0, %xmm0
+; AVX-NEXT: vmovaps %xmm0, (%eax)
+; AVX-NEXT: retl
entry:
- %tmp3 = load <4 x float>, <4 x float>* %c ; <<4 x float>> [#uses=1]
- %tmp11 = bitcast <4 x float> %a to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp12 = bitcast <4 x float> %b to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp13 = xor <4 x i32> %tmp11, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1]
- %tmp14 = and <4 x i32> %tmp12, %tmp13 ; <<4 x i32>> [#uses=1]
- %tmp27 = bitcast <4 x float> %tmp3 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %tmp28 = or <4 x i32> %tmp14, %tmp27 ; <<4 x i32>> [#uses=1]
- %tmp30 = bitcast <4 x i32> %tmp28 to <4 x float> ; <<4 x float>> [#uses=1]
- store <4 x float> %tmp30, <4 x float>* %d
- ret void
+ %tmp3 = load <4 x float>, <4 x float>* %c
+ %tmp11 = bitcast <4 x float> %a to <4 x i32>
+ %tmp12 = bitcast <4 x float> %b to <4 x i32>
+ %tmp13 = xor <4 x i32> %tmp11, < i32 -1, i32 -1, i32 -1, i32 -1 >
+ %tmp14 = and <4 x i32> %tmp12, %tmp13
+ %tmp27 = bitcast <4 x float> %tmp3 to <4 x i32>
+ %tmp28 = or <4 x i32> %tmp14, %tmp27
+ %tmp30 = bitcast <4 x i32> %tmp28 to <4 x float>
+ store <4 x float> %tmp30, <4 x float>* %d
+ ret void
}
diff --git a/test/CodeGen/X86/vec_partial.ll b/test/CodeGen/X86/vec_partial.ll
index 469667a28a76..e5ac81add7f6 100644
--- a/test/CodeGen/X86/vec_partial.ll
+++ b/test/CodeGen/X86/vec_partial.ll
@@ -1,11 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; PR11580
define <3 x float> @addf3(<3 x float> %x) {
-; CHECK-LABEL: addf3
-; CHECK: # BB#0:
-; CHECK-NEXT: addps .LCPI0_0(%rip), %xmm0
-; CHECK-NEXT: retq
+; CHECK-LABEL: addf3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: addps {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
entry:
%add = fadd <3 x float> %x, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
ret <3 x float> %add
@@ -13,9 +14,9 @@ entry:
; PR11580
define <4 x float> @cvtf3_f4(<3 x float> %x) {
-; CHECK-LABEL: cvtf3_f4
-; CHECK: # BB#0:
-; CHECK-NEXT: retq
+; CHECK-LABEL: cvtf3_f4:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%extractVec = shufflevector <3 x float> %x, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
ret <4 x float> %extractVec
@@ -23,9 +24,9 @@ entry:
; PR11580
define <3 x float> @cvtf4_f3(<4 x float> %x) {
-; CHECK-LABEL: cvtf4_f3
-; CHECK: # BB#0:
-; CHECK-NEXT: retq
+; CHECK-LABEL: cvtf4_f3:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: retq
entry:
%extractVec = shufflevector <4 x float> %x, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
ret <3 x float> %extractVec
diff --git a/test/CodeGen/X86/vec_sdiv_to_shift.ll b/test/CodeGen/X86/vec_sdiv_to_shift.ll
index 7f71a0c2ea5b..f7151af528b5 100644
--- a/test/CodeGen/X86/vec_sdiv_to_shift.ll
+++ b/test/CodeGen/X86/vec_sdiv_to_shift.ll
@@ -1,93 +1,286 @@
-; RUN: llc < %s -march=x86-64 -mcpu=penryn -mattr=+avx2 | FileCheck %s
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
define <8 x i16> @sdiv_vec8x16(<8 x i16> %var) {
+; SSE-LABEL: sdiv_vec8x16:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psraw $15, %xmm1
+; SSE-NEXT: psrlw $11, %xmm1
+; SSE-NEXT: paddw %xmm0, %xmm1
+; SSE-NEXT: psraw $5, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sdiv_vec8x16:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpsraw $15, %xmm0, %xmm1
+; AVX-NEXT: vpsrlw $11, %xmm1, %xmm1
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $5, %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
-; CHECK: sdiv_vec8x16
-; CHECK: psraw $15
-; CHECK: vpsrlw $11
-; CHECK: vpaddw
-; CHECK: vpsraw $5
-; CHECK: ret
%0 = sdiv <8 x i16> %var, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
ret <8 x i16> %0
}
define <8 x i16> @sdiv_vec8x16_minsize(<8 x i16> %var) minsize {
+; SSE-LABEL: sdiv_vec8x16_minsize:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psraw $15, %xmm1
+; SSE-NEXT: psrlw $11, %xmm1
+; SSE-NEXT: paddw %xmm0, %xmm1
+; SSE-NEXT: psraw $5, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sdiv_vec8x16_minsize:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpsraw $15, %xmm0, %xmm1
+; AVX-NEXT: vpsrlw $11, %xmm1, %xmm1
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsraw $5, %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
-; CHECK: sdiv_vec8x16_minsize
-; CHECK: psraw $15
-; CHECK: vpsrlw $11
-; CHECK: vpaddw
-; CHECK: vpsraw $5
-; CHECK: ret
%0 = sdiv <8 x i16> %var, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
ret <8 x i16> %0
}
-
define <4 x i32> @sdiv_zero(<4 x i32> %var) {
+; SSE-LABEL: sdiv_zero:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: pextrd $1, %xmm0, %eax
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: cltd
+; SSE-NEXT: idivl %esi
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: cltd
+; SSE-NEXT: idivl %esi
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: pinsrd $1, %ecx, %xmm1
+; SSE-NEXT: pextrd $2, %xmm0, %eax
+; SSE-NEXT: cltd
+; SSE-NEXT: idivl %esi
+; SSE-NEXT: pinsrd $2, %eax, %xmm1
+; SSE-NEXT: pextrd $3, %xmm0, %eax
+; SSE-NEXT: cltd
+; SSE-NEXT: idivl %esi
+; SSE-NEXT: pinsrd $3, %eax, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sdiv_zero:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpextrd $1, %xmm0, %eax
+; AVX-NEXT: xorl %esi, %esi
+; AVX-NEXT: cltd
+; AVX-NEXT: idivl %esi
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: cltd
+; AVX-NEXT: idivl %esi
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
+; AVX-NEXT: vpextrd $2, %xmm0, %eax
+; AVX-NEXT: cltd
+; AVX-NEXT: idivl %esi
+; AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrd $3, %xmm0, %eax
+; AVX-NEXT: cltd
+; AVX-NEXT: idivl %esi
+; AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
+; AVX-NEXT: retq
entry:
-; CHECK: sdiv_zero
-; CHECK-NOT: sra
-; CHECK: ret
%0 = sdiv <4 x i32> %var, <i32 0, i32 0, i32 0, i32 0>
ret <4 x i32> %0
}
define <4 x i32> @sdiv_vec4x32(<4 x i32> %var) {
+; SSE-LABEL: sdiv_vec4x32:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: psrld $28, %xmm1
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: psrad $4, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sdiv_vec4x32:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
+; AVX-NEXT: vpsrld $28, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $4, %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
-; CHECK: sdiv_vec4x32
-; CHECK: vpsrad $31
-; CHECK: vpsrld $28
-; CHECK: vpaddd
-; CHECK: vpsrad $4
-; CHECK: ret
%0 = sdiv <4 x i32> %var, <i32 16, i32 16, i32 16, i32 16>
ret <4 x i32> %0
}
define <4 x i32> @sdiv_negative(<4 x i32> %var) {
+; SSE-LABEL: sdiv_negative:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: psrld $28, %xmm1
+; SSE-NEXT: paddd %xmm0, %xmm1
+; SSE-NEXT: psrad $4, %xmm1
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: psubd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sdiv_negative:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
+; AVX-NEXT: vpsrld $28, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $4, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
entry:
-; CHECK: sdiv_negative
-; CHECK: vpsrad $31
-; CHECK: vpsrld $28
-; CHECK: vpaddd
-; CHECK: vpsrad $4
-; CHECK: vpsubd
-; CHECK: ret
%0 = sdiv <4 x i32> %var, <i32 -16, i32 -16, i32 -16, i32 -16>
ret <4 x i32> %0
}
define <8 x i32> @sdiv8x32(<8 x i32> %var) {
+; SSE-LABEL: sdiv8x32:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: psrad $31, %xmm2
+; SSE-NEXT: psrld $26, %xmm2
+; SSE-NEXT: paddd %xmm0, %xmm2
+; SSE-NEXT: psrad $6, %xmm2
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: psrad $31, %xmm3
+; SSE-NEXT: psrld $26, %xmm3
+; SSE-NEXT: paddd %xmm1, %xmm3
+; SSE-NEXT: psrad $6, %xmm3
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: sdiv8x32:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
+; AVX1-NEXT: vpsrld $26, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $6, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
+; AVX1-NEXT: vpsrld $26, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrad $6, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sdiv8x32:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1
+; AVX2-NEXT: vpsrld $26, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $6, %ymm0, %ymm0
+; AVX2-NEXT: retq
entry:
-; CHECK: sdiv8x32
-; CHECK: vpsrad $31
-; CHECK: vpsrld $26
-; CHECK: vpaddd
-; CHECK: vpsrad $6
-; CHECK: ret
%0 = sdiv <8 x i32> %var, <i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64>
ret <8 x i32> %0
}
define <16 x i16> @sdiv16x16(<16 x i16> %var) {
+; SSE-LABEL: sdiv16x16:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: psraw $15, %xmm2
+; SSE-NEXT: psrlw $14, %xmm2
+; SSE-NEXT: paddw %xmm0, %xmm2
+; SSE-NEXT: psraw $2, %xmm2
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: psraw $15, %xmm3
+; SSE-NEXT: psrlw $14, %xmm3
+; SSE-NEXT: paddw %xmm1, %xmm3
+; SSE-NEXT: psraw $2, %xmm3
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: sdiv16x16:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
+; AVX1-NEXT: vpsrlw $14, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpsraw $2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlw $14, %xmm2, %xmm2
+; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsraw $2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sdiv16x16:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpsraw $15, %ymm0, %ymm1
+; AVX2-NEXT: vpsrlw $14, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsraw $2, %ymm0, %ymm0
+; AVX2-NEXT: retq
entry:
-; CHECK: sdiv16x16
-; CHECK: vpsraw $15
-; CHECK: vpsrlw $14
-; CHECK: vpaddw
-; CHECK: vpsraw $2
-; CHECK: ret
%a0 = sdiv <16 x i16> %var, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
ret <16 x i16> %a0
}
-; CHECK: sdiv_non_splat
-; CHECK: idivl
-; CHECK: ret
define <4 x i32> @sdiv_non_splat(<4 x i32> %x) {
+; SSE-LABEL: sdiv_non_splat:
+; SSE: # BB#0:
+; SSE-NEXT: pextrd $1, %xmm0, %eax
+; SSE-NEXT: xorl %ecx, %ecx
+; SSE-NEXT: cltd
+; SSE-NEXT: idivl %ecx
+; SSE-NEXT: movd %xmm0, %edx
+; SSE-NEXT: movl %edx, %esi
+; SSE-NEXT: shrl $31, %esi
+; SSE-NEXT: addl %edx, %esi
+; SSE-NEXT: sarl %esi
+; SSE-NEXT: movd %esi, %xmm1
+; SSE-NEXT: pinsrd $1, %eax, %xmm1
+; SSE-NEXT: pextrd $2, %xmm0, %eax
+; SSE-NEXT: cltd
+; SSE-NEXT: idivl %ecx
+; SSE-NEXT: pinsrd $2, %eax, %xmm1
+; SSE-NEXT: pextrd $3, %xmm0, %eax
+; SSE-NEXT: cltd
+; SSE-NEXT: idivl %ecx
+; SSE-NEXT: pinsrd $3, %eax, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sdiv_non_splat:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrd $1, %xmm0, %eax
+; AVX-NEXT: xorl %ecx, %ecx
+; AVX-NEXT: cltd
+; AVX-NEXT: idivl %ecx
+; AVX-NEXT: vmovd %xmm0, %edx
+; AVX-NEXT: movl %edx, %esi
+; AVX-NEXT: shrl $31, %esi
+; AVX-NEXT: addl %edx, %esi
+; AVX-NEXT: sarl %esi
+; AVX-NEXT: vmovd %esi, %xmm1
+; AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrd $2, %xmm0, %eax
+; AVX-NEXT: cltd
+; AVX-NEXT: idivl %ecx
+; AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpextrd $3, %xmm0, %eax
+; AVX-NEXT: cltd
+; AVX-NEXT: idivl %ecx
+; AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
+; AVX-NEXT: retq
%y = sdiv <4 x i32> %x, <i32 2, i32 0, i32 0, i32 0>
ret <4 x i32> %y
}
diff --git a/test/CodeGen/X86/vec_set-2.ll b/test/CodeGen/X86/vec_set-2.ll
index a8f1187084d6..02f25d8e35a2 100644
--- a/test/CodeGen/X86/vec_set-2.ll
+++ b/test/CodeGen/X86/vec_set-2.ll
@@ -1,19 +1,27 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movss | count 1
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movd | count 1
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s
define <4 x float> @test1(float %a) nounwind {
- %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0 ; <<4 x float>> [#uses=1]
- %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
- %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
- %tmp7 = insertelement <4 x float> %tmp6, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
- ret <4 x float> %tmp7
+; CHECK-LABEL: test1:
+; CHECK: # BB#0:
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retl
+ %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0
+ %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1
+ %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 2
+ %tmp7 = insertelement <4 x float> %tmp6, float 0.000000e+00, i32 3
+ ret <4 x float> %tmp7
}
define <2 x i64> @test(i32 %a) nounwind {
- %tmp = insertelement <4 x i32> zeroinitializer, i32 %a, i32 0 ; <<8 x i16>> [#uses=1]
- %tmp6 = insertelement <4 x i32> %tmp, i32 0, i32 1 ; <<8 x i32>> [#uses=1]
- %tmp8 = insertelement <4 x i32> %tmp6, i32 0, i32 2 ; <<8 x i32>> [#uses=1]
- %tmp10 = insertelement <4 x i32> %tmp8, i32 0, i32 3 ; <<8 x i32>> [#uses=1]
- %tmp19 = bitcast <4 x i32> %tmp10 to <2 x i64> ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %tmp19
+; CHECK-LABEL: test:
+; CHECK: # BB#0:
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retl
+ %tmp = insertelement <4 x i32> zeroinitializer, i32 %a, i32 0
+ %tmp6 = insertelement <4 x i32> %tmp, i32 0, i32 1
+ %tmp8 = insertelement <4 x i32> %tmp6, i32 0, i32 2
+ %tmp10 = insertelement <4 x i32> %tmp8, i32 0, i32 3
+ %tmp19 = bitcast <4 x i32> %tmp10 to <2 x i64>
+ ret <2 x i64> %tmp19
}
diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll
index a13c813ea7b0..ee4a08599968 100644
--- a/test/CodeGen/X86/vec_set-3.ll
+++ b/test/CodeGen/X86/vec_set-3.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s
define <4 x float> @test(float %a) {
; CHECK-LABEL: test:
-; CHECK: insertps $29, {{.*}}, %xmm0
+; CHECK: # BB#0:
+; CHECK-NEXT: insertps {{.*#+}} xmm0 = zero,mem[0],zero,zero
; CHECK-NEXT: retl
-
-entry:
%tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1
%tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 2
%tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 3
@@ -14,11 +14,10 @@ entry:
define <2 x i64> @test2(i32 %a) {
; CHECK-LABEL: test2:
-; CHECK: movd {{.*}}, %xmm0
+; CHECK: # BB#0:
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
; CHECK-NEXT: retl
-
-entry:
%tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2
%tmp9 = insertelement <4 x i32> %tmp7, i32 0, i32 3
%tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64>
@@ -27,9 +26,9 @@ entry:
define <4 x float> @test3(<4 x float> %A) {
; CHECK-LABEL: test3:
-; CHECK: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
+; CHECK: # BB#0:
+; CHECK-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
; CHECK-NEXT: retl
-
%tmp0 = extractelement <4 x float> %A, i32 0
%tmp1 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef >, float %tmp0, i32 1
%tmp2 = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 2
diff --git a/test/CodeGen/X86/vec_set-4.ll b/test/CodeGen/X86/vec_set-4.ll
index 332c8b70760f..8f35529d61b4 100644
--- a/test/CodeGen/X86/vec_set-4.ll
+++ b/test/CodeGen/X86/vec_set-4.ll
@@ -1,24 +1,34 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep pinsrw | count 2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s
define <2 x i64> @test(i16 %a) nounwind {
-entry:
- %tmp10 = insertelement <8 x i16> zeroinitializer, i16 %a, i32 3 ; <<8 x i16>> [#uses=1]
- %tmp12 = insertelement <8 x i16> %tmp10, i16 0, i32 4 ; <<8 x i16>> [#uses=1]
- %tmp14 = insertelement <8 x i16> %tmp12, i16 0, i32 5 ; <<8 x i16>> [#uses=1]
- %tmp16 = insertelement <8 x i16> %tmp14, i16 0, i32 6 ; <<8 x i16>> [#uses=1]
- %tmp18 = insertelement <8 x i16> %tmp16, i16 0, i32 7 ; <<8 x i16>> [#uses=1]
- %tmp19 = bitcast <8 x i16> %tmp18 to <2 x i64> ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %tmp19
+; CHECK-LABEL: test:
+; CHECK: # BB#0:
+; CHECK-NEXT: pxor %xmm0, %xmm0
+; CHECK-NEXT: pinsrw $3, {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT: retl
+ %tmp10 = insertelement <8 x i16> zeroinitializer, i16 %a, i32 3
+ %tmp12 = insertelement <8 x i16> %tmp10, i16 0, i32 4
+ %tmp14 = insertelement <8 x i16> %tmp12, i16 0, i32 5
+ %tmp16 = insertelement <8 x i16> %tmp14, i16 0, i32 6
+ %tmp18 = insertelement <8 x i16> %tmp16, i16 0, i32 7
+ %tmp19 = bitcast <8 x i16> %tmp18 to <2 x i64>
+ ret <2 x i64> %tmp19
}
define <2 x i64> @test2(i8 %a) nounwind {
-entry:
- %tmp24 = insertelement <16 x i8> zeroinitializer, i8 %a, i32 10 ; <<16 x i8>> [#uses=1]
- %tmp26 = insertelement <16 x i8> %tmp24, i8 0, i32 11 ; <<16 x i8>> [#uses=1]
- %tmp28 = insertelement <16 x i8> %tmp26, i8 0, i32 12 ; <<16 x i8>> [#uses=1]
- %tmp30 = insertelement <16 x i8> %tmp28, i8 0, i32 13 ; <<16 x i8>> [#uses=1]
- %tmp32 = insertelement <16 x i8> %tmp30, i8 0, i32 14 ; <<16 x i8>> [#uses=1]
- %tmp34 = insertelement <16 x i8> %tmp32, i8 0, i32 15 ; <<16 x i8>> [#uses=1]
- %tmp35 = bitcast <16 x i8> %tmp34 to <2 x i64> ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %tmp35
+; CHECK-LABEL: test2:
+; CHECK: # BB#0:
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: pxor %xmm0, %xmm0
+; CHECK-NEXT: pinsrw $5, %eax, %xmm0
+; CHECK-NEXT: retl
+ %tmp24 = insertelement <16 x i8> zeroinitializer, i8 %a, i32 10
+ %tmp26 = insertelement <16 x i8> %tmp24, i8 0, i32 11
+ %tmp28 = insertelement <16 x i8> %tmp26, i8 0, i32 12
+ %tmp30 = insertelement <16 x i8> %tmp28, i8 0, i32 13
+ %tmp32 = insertelement <16 x i8> %tmp30, i8 0, i32 14
+ %tmp34 = insertelement <16 x i8> %tmp32, i8 0, i32 15
+ %tmp35 = bitcast <16 x i8> %tmp34 to <2 x i64>
+ ret <2 x i64> %tmp35
}
diff --git a/test/CodeGen/X86/vec_set-6.ll b/test/CodeGen/X86/vec_set-6.ll
index 0713d956ee44..4429834b8ef0 100644
--- a/test/CodeGen/X86/vec_set-6.ll
+++ b/test/CodeGen/X86/vec_set-6.ll
@@ -1,12 +1,16 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -o %t
-; RUN: grep movss %t | count 1
-; RUN: grep movq %t | count 1
-; RUN: grep shufps %t | count 1
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s
define <4 x float> @test(float %a, float %b, float %c) nounwind {
- %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1 ; <<4 x float>> [#uses=1]
- %tmp8 = insertelement <4 x float> %tmp, float %b, i32 2 ; <<4 x float>> [#uses=1]
- %tmp10 = insertelement <4 x float> %tmp8, float %c, i32 3 ; <<4 x float>> [#uses=1]
- ret <4 x float> %tmp10
+; CHECK-LABEL: test:
+; CHECK: # BB#0:
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1]
+; CHECK-NEXT: retl
+ %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1
+ %tmp8 = insertelement <4 x float> %tmp, float %b, i32 2
+ %tmp10 = insertelement <4 x float> %tmp8, float %c, i32 3
+ ret <4 x float> %tmp10
}
diff --git a/test/CodeGen/X86/vec_set-7.ll b/test/CodeGen/X86/vec_set-7.ll
index 1701e491da66..e8fe6debb140 100644
--- a/test/CodeGen/X86/vec_set-7.ll
+++ b/test/CodeGen/X86/vec_set-7.ll
@@ -1,11 +1,17 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movsd | count 1
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s
define <2 x i64> @test(<2 x i64>* %p) nounwind {
- %tmp = bitcast <2 x i64>* %p to double*
- %tmp.upgrd.1 = load double, double* %tmp
- %tmp.upgrd.2 = insertelement <2 x double> undef, double %tmp.upgrd.1, i32 0
- %tmp5 = insertelement <2 x double> %tmp.upgrd.2, double 0.0, i32 1
- %tmp.upgrd.3 = bitcast <2 x double> %tmp5 to <2 x i64>
- ret <2 x i64> %tmp.upgrd.3
+; CHECK-LABEL: test:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: retl
+ %tmp = bitcast <2 x i64>* %p to double*
+ %tmp.upgrd.1 = load double, double* %tmp
+ %tmp.upgrd.2 = insertelement <2 x double> undef, double %tmp.upgrd.1, i32 0
+ %tmp5 = insertelement <2 x double> %tmp.upgrd.2, double 0.0, i32 1
+ %tmp.upgrd.3 = bitcast <2 x double> %tmp5 to <2 x i64>
+ ret <2 x i64> %tmp.upgrd.3
}
diff --git a/test/CodeGen/X86/vec_set-8.ll b/test/CodeGen/X86/vec_set-8.ll
index 41061ae7ac23..560e5c568faf 100644
--- a/test/CodeGen/X86/vec_set-8.ll
+++ b/test/CodeGen/X86/vec_set-8.ll
@@ -1,13 +1,12 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=-avx | FileCheck %s
-; CHECK-NOT: movsd
-; CHECK: movd {{%rdi|%rcx}}, %xmm0
-; CHECK-NOT: movsd
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s
define <2 x i64> @test(i64 %i) nounwind {
-entry:
- %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0
- %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1
- ret <2 x i64> %tmp11
+; CHECK-LABEL: test:
+; CHECK: # BB#0:
+; CHECK-NEXT: movd %rdi, %xmm0
+; CHECK-NEXT: retq
+ %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0
+ %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1
+ ret <2 x i64> %tmp11
}
-
diff --git a/test/CodeGen/X86/vec_set-A.ll b/test/CodeGen/X86/vec_set-A.ll
index 92dda4c11b88..cae39a3d775b 100644
--- a/test/CodeGen/X86/vec_set-A.ll
+++ b/test/CodeGen/X86/vec_set-A.ll
@@ -1,7 +1,12 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
-; CHECK: movl $1, %{{.*}}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s
+
define <2 x i64> @test1() nounwind {
-entry:
- ret <2 x i64> < i64 1, i64 0 >
+; CHECK-LABEL: test1:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: retl
+ ret <2 x i64> < i64 1, i64 0 >
}
diff --git a/test/CodeGen/X86/vec_set-B.ll b/test/CodeGen/X86/vec_set-B.ll
index 5578ecaf0007..0580a3376656 100644
--- a/test/CodeGen/X86/vec_set-B.ll
+++ b/test/CodeGen/X86/vec_set-B.ll
@@ -1,7 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep esp | count 2
-
-; CHECK-NOT: movaps
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s
; These should both generate something like this:
;_test3:
@@ -11,16 +9,26 @@
; ret
define <2 x i64> @test3(i64 %arg) nounwind {
-entry:
- %A = and i64 %arg, 1234567
- %B = insertelement <2 x i64> zeroinitializer, i64 %A, i32 0
- ret <2 x i64> %B
+; CHECK-LABEL: test3:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $1234567, %eax # imm = 0x12D687
+; CHECK-NEXT: andl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: retl
+ %A = and i64 %arg, 1234567
+ %B = insertelement <2 x i64> zeroinitializer, i64 %A, i32 0
+ ret <2 x i64> %B
}
define <2 x i64> @test2(i64 %arg) nounwind {
-entry:
- %A = and i64 %arg, 1234567
- %B = insertelement <2 x i64> undef, i64 %A, i32 0
- ret <2 x i64> %B
+; CHECK-LABEL: test2:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl $1234567, %eax # imm = 0x12D687
+; CHECK-NEXT: andl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: retl
+ %A = and i64 %arg, 1234567
+ %B = insertelement <2 x i64> undef, i64 %A, i32 0
+ ret <2 x i64> %B
}
diff --git a/test/CodeGen/X86/vec_set-C.ll b/test/CodeGen/X86/vec_set-C.ll
index 052da30a6bb8..cbcac34ce4a5 100644
--- a/test/CodeGen/X86/vec_set-C.ll
+++ b/test/CodeGen/X86/vec_set-C.ll
@@ -1,8 +1,17 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mattr=+sse2,-avx | grep movq
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mattr=+sse2,-avx | grep mov | count 1
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux -mattr=+sse2,-avx | grep movd
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+sse2,-avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2,-avx | FileCheck %s --check-prefix=X64
define <2 x i64> @t1(i64 %x) nounwind {
- %tmp8 = insertelement <2 x i64> zeroinitializer, i64 %x, i32 0
- ret <2 x i64> %tmp8
+; X32-LABEL: t1:
+; X32: # BB#0:
+; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0:
+; X64-NEXT: movd %rdi, %xmm0
+; X64-NEXT: retq
+ %tmp8 = insertelement <2 x i64> zeroinitializer, i64 %x, i32 0
+ ret <2 x i64> %tmp8
}
diff --git a/test/CodeGen/X86/vec_set-D.ll b/test/CodeGen/X86/vec_set-D.ll
index 9c1e1acf0bab..f736a4ab45be 100644
--- a/test/CodeGen/X86/vec_set-D.ll
+++ b/test/CodeGen/X86/vec_set-D.ll
@@ -1,9 +1,12 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
-
-; CHECK: movq
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s
define <4 x i32> @t(i32 %x, i32 %y) nounwind {
- %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %x, i32 0
- %tmp2 = insertelement <4 x i32> %tmp1, i32 %y, i32 1
- ret <4 x i32> %tmp2
+; CHECK-LABEL: t:
+; CHECK: # BB#0:
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: retl
+ %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %x, i32 0
+ %tmp2 = insertelement <4 x i32> %tmp1, i32 %y, i32 1
+ ret <4 x i32> %tmp2
}
diff --git a/test/CodeGen/X86/vec_set-F.ll b/test/CodeGen/X86/vec_set-F.ll
index aa17f9bfbf5c..e69d8f4fc4da 100644
--- a/test/CodeGen/X86/vec_set-F.ll
+++ b/test/CodeGen/X86/vec_set-F.ll
@@ -1,19 +1,27 @@
-; RUN: llc < %s -mtriple=i686-linux -mattr=+sse2 | grep movq
-; RUN: llc < %s -mtriple=i686-linux -mattr=+sse2 | grep movsd
-; RUN: llc < %s -mtriple=i686-linux -mattr=+sse2 | grep mov | count 3
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-linux -mattr=+sse2 | FileCheck %s
define <2 x i64> @t1(<2 x i64>* %ptr) nounwind {
- %tmp45 = bitcast <2 x i64>* %ptr to <2 x i32>*
- %tmp615 = load <2 x i32>, <2 x i32>* %tmp45
- %tmp7 = bitcast <2 x i32> %tmp615 to i64
- %tmp8 = insertelement <2 x i64> zeroinitializer, i64 %tmp7, i32 0
- ret <2 x i64> %tmp8
+; CHECK-LABEL: t1:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: retl
+ %tmp45 = bitcast <2 x i64>* %ptr to <2 x i32>*
+ %tmp615 = load <2 x i32>, <2 x i32>* %tmp45
+ %tmp7 = bitcast <2 x i32> %tmp615 to i64
+ %tmp8 = insertelement <2 x i64> zeroinitializer, i64 %tmp7, i32 0
+ ret <2 x i64> %tmp8
}
define <2 x i64> @t2(i64 %x) nounwind {
- %tmp717 = bitcast i64 %x to double
- %tmp8 = insertelement <2 x double> undef, double %tmp717, i32 0
- %tmp9 = insertelement <2 x double> %tmp8, double 0.000000e+00, i32 1
- %tmp11 = bitcast <2 x double> %tmp9 to <2 x i64>
- ret <2 x i64> %tmp11
+; CHECK-LABEL: t2:
+; CHECK: # BB#0:
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: retl
+ %tmp717 = bitcast i64 %x to double
+ %tmp8 = insertelement <2 x double> undef, double %tmp717, i32 0
+ %tmp9 = insertelement <2 x double> %tmp8, double 0.000000e+00, i32 1
+ %tmp11 = bitcast <2 x double> %tmp9 to <2 x i64>
+ ret <2 x i64> %tmp11
}
diff --git a/test/CodeGen/X86/vec_set-H.ll b/test/CodeGen/X86/vec_set-H.ll
index 5037e36d3fd5..af8ac70c5b3d 100644
--- a/test/CodeGen/X86/vec_set-H.ll
+++ b/test/CodeGen/X86/vec_set-H.ll
@@ -1,15 +1,21 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep movz
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s
define <2 x i64> @doload64(i16 signext %x) nounwind {
-entry:
- %tmp36 = insertelement <8 x i16> undef, i16 %x, i32 0 ; <<8 x i16>> [#uses=1]
- %tmp37 = insertelement <8 x i16> %tmp36, i16 %x, i32 1 ; <<8 x i16>> [#uses=1]
- %tmp38 = insertelement <8 x i16> %tmp37, i16 %x, i32 2 ; <<8 x i16>> [#uses=1]
- %tmp39 = insertelement <8 x i16> %tmp38, i16 %x, i32 3 ; <<8 x i16>> [#uses=1]
- %tmp40 = insertelement <8 x i16> %tmp39, i16 %x, i32 4 ; <<8 x i16>> [#uses=1]
- %tmp41 = insertelement <8 x i16> %tmp40, i16 %x, i32 5 ; <<8 x i16>> [#uses=1]
- %tmp42 = insertelement <8 x i16> %tmp41, i16 %x, i32 6 ; <<8 x i16>> [#uses=1]
- %tmp43 = insertelement <8 x i16> %tmp42, i16 %x, i32 7 ; <<8 x i16>> [#uses=1]
- %tmp46 = bitcast <8 x i16> %tmp43 to <2 x i64> ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %tmp46
+; CHECK-LABEL: doload64:
+; CHECK: # BB#0:
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; CHECK-NEXT: retl
+ %tmp36 = insertelement <8 x i16> undef, i16 %x, i32 0
+ %tmp37 = insertelement <8 x i16> %tmp36, i16 %x, i32 1
+ %tmp38 = insertelement <8 x i16> %tmp37, i16 %x, i32 2
+ %tmp39 = insertelement <8 x i16> %tmp38, i16 %x, i32 3
+ %tmp40 = insertelement <8 x i16> %tmp39, i16 %x, i32 4
+ %tmp41 = insertelement <8 x i16> %tmp40, i16 %x, i32 5
+ %tmp42 = insertelement <8 x i16> %tmp41, i16 %x, i32 6
+ %tmp43 = insertelement <8 x i16> %tmp42, i16 %x, i32 7
+ %tmp46 = bitcast <8 x i16> %tmp43 to <2 x i64>
+ ret <2 x i64> %tmp46
}
diff --git a/test/CodeGen/X86/vec_set.ll b/test/CodeGen/X86/vec_set.ll
index 53d880b4bbdd..49bd3beef75a 100644
--- a/test/CodeGen/X86/vec_set.ll
+++ b/test/CodeGen/X86/vec_set.ll
@@ -1,15 +1,36 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | grep punpckl | count 7
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s
define void @test(<8 x i16>* %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
- %tmp = insertelement <8 x i16> zeroinitializer, i16 %a0, i32 0 ; <<8 x i16>> [#uses=1]
- %tmp2 = insertelement <8 x i16> %tmp, i16 %a1, i32 1 ; <<8 x i16>> [#uses=1]
- %tmp4 = insertelement <8 x i16> %tmp2, i16 %a2, i32 2 ; <<8 x i16>> [#uses=1]
- %tmp6 = insertelement <8 x i16> %tmp4, i16 %a3, i32 3 ; <<8 x i16>> [#uses=1]
- %tmp8 = insertelement <8 x i16> %tmp6, i16 %a4, i32 4 ; <<8 x i16>> [#uses=1]
- %tmp10 = insertelement <8 x i16> %tmp8, i16 %a5, i32 5 ; <<8 x i16>> [#uses=1]
- %tmp12 = insertelement <8 x i16> %tmp10, i16 %a6, i32 6 ; <<8 x i16>> [#uses=1]
- %tmp14 = insertelement <8 x i16> %tmp12, i16 %a7, i32 7 ; <<8 x i16>> [#uses=1]
- store <8 x i16> %tmp14, <8 x i16>* %b
- ret void
+; CHECK-LABEL: test:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; CHECK-NEXT: movdqa %xmm3, (%eax)
+; CHECK-NEXT: retl
+ %tmp = insertelement <8 x i16> zeroinitializer, i16 %a0, i32 0
+ %tmp2 = insertelement <8 x i16> %tmp, i16 %a1, i32 1
+ %tmp4 = insertelement <8 x i16> %tmp2, i16 %a2, i32 2
+ %tmp6 = insertelement <8 x i16> %tmp4, i16 %a3, i32 3
+ %tmp8 = insertelement <8 x i16> %tmp6, i16 %a4, i32 4
+ %tmp10 = insertelement <8 x i16> %tmp8, i16 %a5, i32 5
+ %tmp12 = insertelement <8 x i16> %tmp10, i16 %a6, i32 6
+ %tmp14 = insertelement <8 x i16> %tmp12, i16 %a7, i32 7
+ store <8 x i16> %tmp14, <8 x i16>* %b
+ ret void
}
diff --git a/test/CodeGen/X86/vec_setcc.ll b/test/CodeGen/X86/vec_setcc.ll
index b69f90cd6e2f..1eef0be2dbbb 100644
--- a/test/CodeGen/X86/vec_setcc.ll
+++ b/test/CodeGen/X86/vec_setcc.ll
@@ -1,179 +1,199 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse2 | FileCheck %s -check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse4.1 | FileCheck %s -check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx | FileCheck %s -check-prefix=AVX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
define <16 x i8> @v16i8_icmp_uge(<16 x i8> %a, <16 x i8> %b) nounwind readnone ssp uwtable {
+; SSE-LABEL: v16i8_icmp_uge:
+; SSE: # BB#0:
+; SSE-NEXT: pmaxub %xmm0, %xmm1
+; SSE-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: v16i8_icmp_uge:
+; AVX: # BB#0:
+; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = icmp uge <16 x i8> %a, %b
%2 = sext <16 x i1> %1 to <16 x i8>
ret <16 x i8> %2
-; SSE2-LABEL: v16i8_icmp_uge:
-; SSE2: pmaxub %xmm0, %xmm1
-; SSE2: pcmpeqb %xmm1, %xmm0
-
-; SSE41-LABEL: v16i8_icmp_uge:
-; SSE41: pmaxub %xmm0, %xmm1
-; SSE41: pcmpeqb %xmm1, %xmm0
-
-; AVX-LABEL: v16i8_icmp_uge:
-; AVX: vpmaxub %xmm1, %xmm0, %xmm1
-; AVX: vpcmpeqb %xmm1, %xmm0, %xmm0
}
define <16 x i8> @v16i8_icmp_ule(<16 x i8> %a, <16 x i8> %b) nounwind readnone ssp uwtable {
+; SSE-LABEL: v16i8_icmp_ule:
+; SSE: # BB#0:
+; SSE-NEXT: pminub %xmm0, %xmm1
+; SSE-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: v16i8_icmp_ule:
+; AVX: # BB#0:
+; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%1 = icmp ule <16 x i8> %a, %b
%2 = sext <16 x i1> %1 to <16 x i8>
ret <16 x i8> %2
-; SSE2-LABEL: v16i8_icmp_ule:
-; SSE2: pminub %xmm0, %xmm1
-; SSE2: pcmpeqb %xmm1, %xmm0
-
-; SSE41-LABEL: v16i8_icmp_ule:
-; SSE41: pminub %xmm0, %xmm1
-; SSE41: pcmpeqb %xmm1, %xmm0
-
-; AVX-LABEL: v16i8_icmp_ule:
-; AVX: vpminub %xmm1, %xmm0, %xmm1
-; AVX: vpcmpeqb %xmm1, %xmm0, %xmm0
}
-
define <8 x i16> @v8i16_icmp_uge(<8 x i16> %a, <8 x i16> %b) nounwind readnone ssp uwtable {
- %1 = icmp uge <8 x i16> %a, %b
- %2 = sext <8 x i1> %1 to <8 x i16>
- ret <8 x i16> %2
; SSE2-LABEL: v8i16_icmp_uge:
-; SSE2: psubusw %xmm0, %xmm1
-; SEE2: pxor %xmm0, %xmm0
-; SSE2: pcmpeqw %xmm1, %xmm0
-
+; SSE2: # BB#0:
+; SSE2-NEXT: psubusw %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
; SSE41-LABEL: v8i16_icmp_uge:
-; SSE41: pmaxuw %xmm0, %xmm1
-; SSE41: pcmpeqw %xmm1, %xmm0
-
+; SSE41: # BB#0:
+; SSE41-NEXT: pmaxuw %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
; AVX-LABEL: v8i16_icmp_uge:
-; AVX: vpmaxuw %xmm1, %xmm0, %xmm1
-; AVX: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX: # BB#0:
+; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = icmp uge <8 x i16> %a, %b
+ %2 = sext <8 x i1> %1 to <8 x i16>
+ ret <8 x i16> %2
}
define <8 x i16> @v8i16_icmp_ule(<8 x i16> %a, <8 x i16> %b) nounwind readnone ssp uwtable {
- %1 = icmp ule <8 x i16> %a, %b
- %2 = sext <8 x i1> %1 to <8 x i16>
- ret <8 x i16> %2
; SSE2-LABEL: v8i16_icmp_ule:
-; SSE2: psubusw %xmm1, %xmm0
-; SSE2: pxor %xmm1, %xmm1
-; SSE2: pcmpeqw %xmm1, %xmm0
-
+; SSE2: # BB#0:
+; SSE2-NEXT: psubusw %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
; SSE41-LABEL: v8i16_icmp_ule:
-; SSE41: pminuw %xmm0, %xmm1
-; SSE41: pcmpeqw %xmm1, %xmm0
-
+; SSE41: # BB#0:
+; SSE41-NEXT: pminuw %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
; AVX-LABEL: v8i16_icmp_ule:
-; AVX: vpminuw %xmm1, %xmm0, %xmm1
-; AVX: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX: # BB#0:
+; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = icmp ule <8 x i16> %a, %b
+ %2 = sext <8 x i1> %1 to <8 x i16>
+ ret <8 x i16> %2
}
-
define <4 x i32> @v4i32_icmp_uge(<4 x i32> %a, <4 x i32> %b) nounwind readnone ssp uwtable {
- %1 = icmp uge <4 x i32> %a, %b
- %2 = sext <4 x i1> %1 to <4 x i32>
- ret <4 x i32> %2
; SSE2-LABEL: v4i32_icmp_uge:
-; SSE2: movdqa {{.*}}(%rip), %xmm2
-; SSE2: pxor %xmm2, %xmm0
-; SSE2: pxor %xmm1, %xmm2
-; SSE2: pcmpgtd %xmm0, %xmm2
-; SSE2: pcmpeqd %xmm0, %xmm0
-; SSE2: pxor %xmm2, %xmm0
-
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
; SSE41-LABEL: v4i32_icmp_uge:
-; SSE41: pmaxud %xmm0, %xmm1
-; SSE41: pcmpeqd %xmm1, %xmm0
-
+; SSE41: # BB#0:
+; SSE41-NEXT: pmaxud %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
; AVX-LABEL: v4i32_icmp_uge:
-; AVX: vpmaxud %xmm1, %xmm0, %xmm1
-; AVX: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX: # BB#0:
+; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = icmp uge <4 x i32> %a, %b
+ %2 = sext <4 x i1> %1 to <4 x i32>
+ ret <4 x i32> %2
}
define <4 x i32> @v4i32_icmp_ule(<4 x i32> %a, <4 x i32> %b) nounwind readnone ssp uwtable {
- %1 = icmp ule <4 x i32> %a, %b
- %2 = sext <4 x i1> %1 to <4 x i32>
- ret <4 x i32> %2
; SSE2-LABEL: v4i32_icmp_ule:
-; SSE2: movdqa {{.*}}(%rip), %xmm2
-; SSE2: pxor %xmm2, %xmm1
-; SSE2: pxor %xmm2, %xmm0
-; SSE2: pcmpgtd %xmm1, %xmm0
-; SSE2: pcmpeqd %xmm1, %xmm1
-; SSE2: pxor %xmm1, %xmm0
-
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
; SSE41-LABEL: v4i32_icmp_ule:
-; SSE41: pminud %xmm0, %xmm1
-; SSE41: pcmpeqd %xmm1, %xmm0
-
+; SSE41: # BB#0:
+; SSE41-NEXT: pminud %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
; AVX-LABEL: v4i32_icmp_ule:
-; AVX: pminud %xmm1, %xmm0, %xmm1
-; AVX: pcmpeqd %xmm1, %xmm0, %xmm0
+; AVX: # BB#0:
+; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = icmp ule <4 x i32> %a, %b
+ %2 = sext <4 x i1> %1 to <4 x i32>
+ ret <4 x i32> %2
}
; At one point we were incorrectly constant-folding a setcc to 0x1 instead of
; 0xff, leading to a constpool load. The instruction doesn't matter here, but it
; should set all bits to 1.
define <16 x i8> @test_setcc_constfold_vi8(<16 x i8> %l, <16 x i8> %r) {
+; SSE-LABEL: test_setcc_constfold_vi8:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_setcc_constfold_vi8:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%test1 = icmp eq <16 x i8> %l, %r
%mask1 = sext <16 x i1> %test1 to <16 x i8>
-
%test2 = icmp ne <16 x i8> %l, %r
%mask2 = sext <16 x i1> %test2 to <16 x i8>
-
%res = or <16 x i8> %mask1, %mask2
ret <16 x i8> %res
-; SSE2-LABEL: test_setcc_constfold_vi8:
-; SSE2: pcmpeqd %xmm0, %xmm0
-
-; SSE41-LABEL: test_setcc_constfold_vi8:
-; SSE41: pcmpeqd %xmm0, %xmm0
-
-; AVX-LABEL: test_setcc_constfold_vi8:
-; AVX: vpcmpeqd %xmm0, %xmm0, %xmm0
}
; Make sure sensible results come from doing extension afterwards
define <16 x i8> @test_setcc_constfold_vi1(<16 x i8> %l, <16 x i8> %r) {
+; SSE-LABEL: test_setcc_constfold_vi1:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_setcc_constfold_vi1:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%test1 = icmp eq <16 x i8> %l, %r
%test2 = icmp ne <16 x i8> %l, %r
-
%res = or <16 x i1> %test1, %test2
%mask = sext <16 x i1> %res to <16 x i8>
ret <16 x i8> %mask
-; SSE2-LABEL: test_setcc_constfold_vi1:
-; SSE2: pcmpeqd %xmm0, %xmm0
-
-; SSE41-LABEL: test_setcc_constfold_vi1:
-; SSE41: pcmpeqd %xmm0, %xmm0
-
-; AVX-LABEL: test_setcc_constfold_vi1:
-; AVX: vpcmpeqd %xmm0, %xmm0, %xmm0
}
-
; 64-bit case is also particularly important, as the constant "-1" is probably
; just 32-bits wide.
define <2 x i64> @test_setcc_constfold_vi64(<2 x i64> %l, <2 x i64> %r) {
+; SSE-LABEL: test_setcc_constfold_vi64:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_setcc_constfold_vi64:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
%test1 = icmp eq <2 x i64> %l, %r
%mask1 = sext <2 x i1> %test1 to <2 x i64>
-
%test2 = icmp ne <2 x i64> %l, %r
%mask2 = sext <2 x i1> %test2 to <2 x i64>
-
%res = or <2 x i64> %mask1, %mask2
ret <2 x i64> %res
-; SSE2-LABEL: test_setcc_constfold_vi64:
-; SSE2: pcmpeqd %xmm0, %xmm0
-
-; SSE41-LABEL: test_setcc_constfold_vi64:
-; SSE41: pcmpeqd %xmm0, %xmm0
-
-; AVX-LABEL: test_setcc_constfold_vi64:
-; AVX: vpcmpeqd %xmm0, %xmm0, %xmm0
}
diff --git a/test/CodeGen/X86/vec_shift.ll b/test/CodeGen/X86/vec_shift.ll
index ddf0469b72a7..55b55936634d 100644
--- a/test/CodeGen/X86/vec_shift.ll
+++ b/test/CodeGen/X86/vec_shift.ll
@@ -1,8 +1,17 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep psllw
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep psrlq
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep psraw
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
define <2 x i64> @t1(<2 x i64> %b1, <2 x i64> %c) nounwind {
+; X32-LABEL: t1:
+; X32: # BB#0: # %entry
+; X32-NEXT: psllw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0: # %entry
+; X64-NEXT: psllw %xmm1, %xmm0
+; X64-NEXT: retq
entry:
%tmp6 = bitcast <2 x i64> %c to <8 x i16> ; <<8 x i16>> [#uses=1]
%tmp8 = bitcast <2 x i64> %b1 to <8 x i16> ; <<8 x i16>> [#uses=1]
@@ -12,6 +21,17 @@ entry:
}
define <2 x i64> @t3(<2 x i64> %b1, i32 %c) nounwind {
+; X32-LABEL: t3:
+; X32: # BB#0: # %entry
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: psraw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t3:
+; X64: # BB#0: # %entry
+; X64-NEXT: movd %edi, %xmm1
+; X64-NEXT: psraw %xmm1, %xmm0
+; X64-NEXT: retq
entry:
%tmp2 = bitcast <2 x i64> %b1 to <8 x i16> ; <<8 x i16>> [#uses=1]
%tmp4 = insertelement <4 x i32> undef, i32 %c, i32 0 ; <<4 x i32>> [#uses=1]
@@ -21,14 +41,23 @@ entry:
ret <2 x i64> %tmp11
}
-declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
+declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
define <2 x i64> @t2(<2 x i64> %b1, <2 x i64> %c) nounwind {
+; X32-LABEL: t2:
+; X32: # BB#0: # %entry
+; X32-NEXT: psrlq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t2:
+; X64: # BB#0: # %entry
+; X64-NEXT: psrlq %xmm1, %xmm0
+; X64-NEXT: retq
entry:
%tmp9 = tail call <2 x i64> @llvm.x86.sse2.psrl.q( <2 x i64> %b1, <2 x i64> %c ) nounwind readnone ; <<2 x i64>> [#uses=1]
ret <2 x i64> %tmp9
}
-declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
+declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
-declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
+declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
diff --git a/test/CodeGen/X86/vec_shift2.ll b/test/CodeGen/X86/vec_shift2.ll
index c5f9dc4ace32..21d599fead08 100644
--- a/test/CodeGen/X86/vec_shift2.ll
+++ b/test/CodeGen/X86/vec_shift2.ll
@@ -1,6 +1,21 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep CPI
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
define <2 x i64> @t1(<2 x i64> %b1, <2 x i64> %c) nounwind {
+; X32-LABEL: t1:
+; X32: # BB#0:
+; X32-NEXT: movl $14, %eax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: psrlw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0:
+; X64-NEXT: movl $14, %eax
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: psrlw %xmm1, %xmm0
+; X64-NEXT: retq
%tmp1 = bitcast <2 x i64> %b1 to <8 x i16>
%tmp2 = tail call <8 x i16> @llvm.x86.sse2.psrl.w( <8 x i16> %tmp1, <8 x i16> bitcast (<4 x i32> < i32 14, i32 undef, i32 undef, i32 undef > to <8 x i16>) ) nounwind readnone
%tmp3 = bitcast <8 x i16> %tmp2 to <2 x i64>
@@ -8,10 +23,23 @@ define <2 x i64> @t1(<2 x i64> %b1, <2 x i64> %c) nounwind {
}
define <4 x i32> @t2(<2 x i64> %b1, <2 x i64> %c) nounwind {
+; X32-LABEL: t2:
+; X32: # BB#0:
+; X32-NEXT: movl $14, %eax
+; X32-NEXT: movd %eax, %xmm1
+; X32-NEXT: pslld %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t2:
+; X64: # BB#0:
+; X64-NEXT: movl $14, %eax
+; X64-NEXT: movd %eax, %xmm1
+; X64-NEXT: pslld %xmm1, %xmm0
+; X64-NEXT: retq
%tmp1 = bitcast <2 x i64> %b1 to <4 x i32>
%tmp2 = tail call <4 x i32> @llvm.x86.sse2.psll.d( <4 x i32> %tmp1, <4 x i32> < i32 14, i32 undef, i32 undef, i32 undef > ) nounwind readnone
ret <4 x i32> %tmp2
}
-declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/test/CodeGen/X86/vec_shift3.ll b/test/CodeGen/X86/vec_shift3.ll
index 1ebf455c0555..071f0d38b96d 100644
--- a/test/CodeGen/X86/vec_shift3.ll
+++ b/test/CodeGen/X86/vec_shift3.ll
@@ -1,20 +1,51 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep psllq
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep psraw
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movd | count 2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
define <2 x i64> @t1(<2 x i64> %x1, i32 %bits) nounwind {
+; X32-LABEL: t1:
+; X32: # BB#0: # %entry
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: psllq %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t1:
+; X64: # BB#0: # %entry
+; X64-NEXT: movd %edi, %xmm1
+; X64-NEXT: psllq %xmm1, %xmm0
+; X64-NEXT: retq
entry:
%tmp3 = tail call <2 x i64> @llvm.x86.sse2.pslli.q( <2 x i64> %x1, i32 %bits ) nounwind readnone ; <<2 x i64>> [#uses=1]
ret <2 x i64> %tmp3
}
define <2 x i64> @t2(<2 x i64> %x1) nounwind {
+; X32-LABEL: t2:
+; X32: # BB#0: # %entry
+; X32-NEXT: psllq $10, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t2:
+; X64: # BB#0: # %entry
+; X64-NEXT: psllq $10, %xmm0
+; X64-NEXT: retq
entry:
%tmp3 = tail call <2 x i64> @llvm.x86.sse2.pslli.q( <2 x i64> %x1, i32 10 ) nounwind readnone ; <<2 x i64>> [#uses=1]
ret <2 x i64> %tmp3
}
define <2 x i64> @t3(<2 x i64> %x1, i32 %bits) nounwind {
+; X32-LABEL: t3:
+; X32: # BB#0: # %entry
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: psraw %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: t3:
+; X64: # BB#0: # %entry
+; X64-NEXT: movd %edi, %xmm1
+; X64-NEXT: psraw %xmm1, %xmm0
+; X64-NEXT: retq
entry:
%tmp2 = bitcast <2 x i64> %x1 to <8 x i16> ; <<8 x i16>> [#uses=1]
%tmp4 = tail call <8 x i16> @llvm.x86.sse2.psrai.w( <8 x i16> %tmp2, i32 %bits ) nounwind readnone ; <<8 x i16>> [#uses=1]
@@ -22,5 +53,5 @@ entry:
ret <2 x i64> %tmp5
}
-declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
-declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
+declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
diff --git a/test/CodeGen/X86/vec_shift4.ll b/test/CodeGen/X86/vec_shift4.ll
index b266a6987557..66229361990f 100644
--- a/test/CodeGen/X86/vec_shift4.ll
+++ b/test/CodeGen/X86/vec_shift4.ll
@@ -1,6 +1,23 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64
define <2 x i64> @shl1(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp {
+; X32-LABEL: shl1:
+; X32: # BB#0: # %entry
+; X32-NEXT: pslld $23, %xmm1
+; X32-NEXT: paddd {{\.LCPI.*}}, %xmm1
+; X32-NEXT: cvttps2dq %xmm1, %xmm1
+; X32-NEXT: pmulld %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: shl1:
+; X64: # BB#0: # %entry
+; X64-NEXT: pslld $23, %xmm1
+; X64-NEXT: paddd {{.*}}(%rip), %xmm1
+; X64-NEXT: cvttps2dq %xmm1, %xmm1
+; X64-NEXT: pmulld %xmm1, %xmm0
+; X64-NEXT: retq
entry:
; CHECK-NOT: shll
; CHECK: pslld
@@ -14,6 +31,51 @@ entry:
}
define <2 x i64> @shl2(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp {
+; X32-LABEL: shl2:
+; X32: # BB#0: # %entry
+; X32-NEXT: movdqa %xmm0, %xmm2
+; X32-NEXT: psllw $5, %xmm1
+; X32-NEXT: movdqa %xmm2, %xmm3
+; X32-NEXT: psllw $4, %xmm3
+; X32-NEXT: pand {{\.LCPI.*}}, %xmm3
+; X32-NEXT: movdqa %xmm1, %xmm0
+; X32-NEXT: pblendvb %xmm3, %xmm2
+; X32-NEXT: movdqa %xmm2, %xmm3
+; X32-NEXT: psllw $2, %xmm3
+; X32-NEXT: pand {{\.LCPI.*}}, %xmm3
+; X32-NEXT: paddb %xmm1, %xmm1
+; X32-NEXT: movdqa %xmm1, %xmm0
+; X32-NEXT: pblendvb %xmm3, %xmm2
+; X32-NEXT: movdqa %xmm2, %xmm3
+; X32-NEXT: paddb %xmm3, %xmm3
+; X32-NEXT: paddb %xmm1, %xmm1
+; X32-NEXT: movdqa %xmm1, %xmm0
+; X32-NEXT: pblendvb %xmm3, %xmm2
+; X32-NEXT: movdqa %xmm2, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: shl2:
+; X64: # BB#0: # %entry
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: psllw $5, %xmm1
+; X64-NEXT: movdqa %xmm2, %xmm3
+; X64-NEXT: psllw $4, %xmm3
+; X64-NEXT: pand {{.*}}(%rip), %xmm3
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: pblendvb %xmm3, %xmm2
+; X64-NEXT: movdqa %xmm2, %xmm3
+; X64-NEXT: psllw $2, %xmm3
+; X64-NEXT: pand {{.*}}(%rip), %xmm3
+; X64-NEXT: paddb %xmm1, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: pblendvb %xmm3, %xmm2
+; X64-NEXT: movdqa %xmm2, %xmm3
+; X64-NEXT: paddb %xmm3, %xmm3
+; X64-NEXT: paddb %xmm1, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: pblendvb %xmm3, %xmm2
+; X64-NEXT: movdqa %xmm2, %xmm0
+; X64-NEXT: retq
entry:
; CHECK-NOT: shlb
; CHECK: pblendvb
diff --git a/test/CodeGen/X86/vec_shift5.ll b/test/CodeGen/X86/vec_shift5.ll
index 499aa22de52d..cba2b5d05041 100644
--- a/test/CodeGen/X86/vec_shift5.ll
+++ b/test/CodeGen/X86/vec_shift5.ll
@@ -1,153 +1,238 @@
-; RUN: llc -march=x86-64 -mattr=+sse2 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
; Verify that we correctly fold target specific packed vector shifts by
; immediate count into a simple build_vector when the elements of the vector
; in input to the packed shift are all constants or undef.
define <8 x i16> @test1() {
+; X32-LABEL: test1:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [8,16,32,64,8,16,32,64]
+; X32-NEXT: retl
+;
+; X64-LABEL: test1:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [8,16,32,64,8,16,32,64]
+; X64-NEXT: retq
%1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> <i16 1, i16 2, i16 4, i16 8, i16 1, i16 2, i16 4, i16 8>, i32 3)
ret <8 x i16> %1
}
-; CHECK-LABEL: test1
-; CHECK-NOT: psll
-; CHECK: movaps
-; CHECK-NEXT: ret
define <8 x i16> @test2() {
+; X32-LABEL: test2:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test2:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
+; X64-NEXT: retq
%1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> <i16 4, i16 8, i16 16, i16 32, i16 4, i16 8, i16 16, i16 32>, i32 3)
ret <8 x i16> %1
}
-; CHECK-LABEL: test2
-; CHECK-NOT: psrl
-; CHECK: movaps
-; CHECK-NEXT: ret
define <8 x i16> @test3() {
+; X32-LABEL: test3:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test3:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
+; X64-NEXT: retq
%1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 4, i16 8, i16 16, i16 32, i16 4, i16 8, i16 16, i16 32>, i32 3)
ret <8 x i16> %1
}
-; CHECK-LABEL: test3
-; CHECK-NOT: psra
-; CHECK: movaps
-; CHECK-NEXT: ret
define <4 x i32> @test4() {
+; X32-LABEL: test4:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [8,16,32,64]
+; X32-NEXT: retl
+;
+; X64-LABEL: test4:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [8,16,32,64]
+; X64-NEXT: retq
%1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 1, i32 2, i32 4, i32 8>, i32 3)
ret <4 x i32> %1
}
-; CHECK-LABEL: test4
-; CHECK-NOT: psll
-; CHECK: movaps
-; CHECK-NEXT: ret
define <4 x i32> @test5() {
+; X32-LABEL: test5:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test5:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4]
+; X64-NEXT: retq
%1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> <i32 4, i32 8, i32 16, i32 32>, i32 3)
ret <4 x i32> %1
}
-; CHECK-LABEL: test5
-; CHECK-NOT: psrl
-; CHECK: movaps
-; CHECK-NEXT: ret
define <4 x i32> @test6() {
+; X32-LABEL: test6:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4]
+; X32-NEXT: retl
+;
+; X64-LABEL: test6:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4]
+; X64-NEXT: retq
%1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> <i32 4, i32 8, i32 16, i32 32>, i32 3)
ret <4 x i32> %1
}
-; CHECK-LABEL: test6
-; CHECK-NOT: psra
-; CHECK: movaps
-; CHECK-NEXT: ret
define <2 x i64> @test7() {
+; X32-LABEL: test7:
+; X32: # BB#0:
+; X32-NEXT: movdqa {{.*#+}} xmm0 = [1,0,2,0]
+; X32-NEXT: psllq $3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test7:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [8,16]
+; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> <i64 1, i64 2>, i32 3)
ret <2 x i64> %1
}
-; CHECK-LABEL: test7
-; CHECK-NOT: psll
-; CHECK: movaps
-; CHECK-NEXT: ret
define <2 x i64> @test8() {
+; X32-LABEL: test8:
+; X32: # BB#0:
+; X32-NEXT: movdqa {{.*#+}} xmm0 = [8,0,16,0]
+; X32-NEXT: psrlq $3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test8:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [1,2]
+; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> <i64 8, i64 16>, i32 3)
ret <2 x i64> %1
}
-; CHECK-LABEL: test8
-; CHECK-NOT: psrl
-; CHECK: movaps
-; CHECK-NEXT: ret
define <8 x i16> @test9() {
+; X32-LABEL: test9:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X32-NEXT: retl
+;
+; X64-LABEL: test9:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X64-NEXT: retq
%1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
ret <8 x i16> %1
}
-; CHECK-LABEL: test9
-; CHECK-NOT: psra
-; CHECK: movaps
-; CHECK-NEXT: ret
define <4 x i32> @test10() {
+; X32-LABEL: test10:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = <u,1,u,4>
+; X32-NEXT: retl
+;
+; X64-LABEL: test10:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = <u,1,u,4>
+; X64-NEXT: retq
%1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
ret <4 x i32> %1
}
-; CHECK-LABEL: test10
-; CHECK-NOT: psra
-; CHECK: movaps
-; CHECK-NEXT: ret
define <2 x i64> @test11() {
+; X32-LABEL: test11:
+; X32: # BB#0:
+; X32-NEXT: movdqa {{.*#+}} xmm0 = <u,u,31,0>
+; X32-NEXT: psrlq $3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test11:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = <u,3>
+; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> <i64 undef, i64 31>, i32 3)
ret <2 x i64> %1
}
-; CHECK-LABEL: test11
-; CHECK-NOT: psrl
-; CHECK: movaps
-; CHECK-NEXT: ret
define <8 x i16> @test12() {
+; X32-LABEL: test12:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X32-NEXT: retl
+;
+; X64-LABEL: test12:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X64-NEXT: retq
%1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
ret <8 x i16> %1
}
-; CHECK-LABEL: test12
-; CHECK-NOT: psra
-; CHECK: movaps
-; CHECK-NEXT: ret
define <4 x i32> @test13() {
+; X32-LABEL: test13:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = <u,1,u,4>
+; X32-NEXT: retl
+;
+; X64-LABEL: test13:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = <u,1,u,4>
+; X64-NEXT: retq
%1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
ret <4 x i32> %1
}
-; CHECK-LABEL: test13
-; CHECK-NOT: psrl
-; CHECK: movaps
-; CHECK-NEXT: ret
define <8 x i16> @test14() {
+; X32-LABEL: test14:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X32-NEXT: retl
+;
+; X64-LABEL: test14:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
+; X64-NEXT: retq
%1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
ret <8 x i16> %1
}
-; CHECK-LABEL: test14
-; CHECK-NOT: psrl
-; CHECK: movaps
-; CHECK-NEXT: ret
define <4 x i32> @test15() {
+; X32-LABEL: test15:
+; X32: # BB#0:
+; X32-NEXT: movaps {{.*#+}} xmm0 = <u,64,u,256>
+; X32-NEXT: retl
+;
+; X64-LABEL: test15:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = <u,64,u,256>
+; X64-NEXT: retq
%1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
ret <4 x i32> %1
}
-; CHECK-LABEL: test15
-; CHECK-NOT: psll
-; CHECK: movaps
-; CHECK-NEXT: ret
define <2 x i64> @test16() {
+; X32-LABEL: test16:
+; X32: # BB#0:
+; X32-NEXT: movdqa {{.*#+}} xmm0 = <u,u,31,0>
+; X32-NEXT: psllq $3, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test16:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = <u,248>
+; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> <i64 undef, i64 31>, i32 3)
ret <2 x i64> %1
}
-; CHECK-LABEL: test16
-; CHECK-NOT: psll
-; CHECK: movaps
-; CHECK-NEXT: ret
-
declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32)
declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
diff --git a/test/CodeGen/X86/vec_shift6.ll b/test/CodeGen/X86/vec_shift6.ll
index b71f9893a9db..c4b7f204be69 100644
--- a/test/CodeGen/X86/vec_shift6.ll
+++ b/test/CodeGen/X86/vec_shift6.ll
@@ -1,134 +1,229 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -check-prefix=AVX2ONLY
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -check-prefix=AVX512
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefix=AVX512
; Verify that we don't scalarize a packed vector shift left of 16-bit
; signed integers if the amount is a constant build_vector.
; Check that we produce a SSE2 packed integer multiply (pmullw) instead.
define <8 x i16> @test1(<8 x i16> %a) {
+; SSE-LABEL: test1:
+; SSE: # BB#0:
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test1:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test1:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
%shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
ret <8 x i16> %shl
}
-; CHECK-LABEL: test1
-; CHECK: pmullw
-; CHECK-NEXT: ret
-
define <8 x i16> @test2(<8 x i16> %a) {
+; SSE-LABEL: test2:
+; SSE: # BB#0:
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test2:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test2:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
%shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
ret <8 x i16> %shl
}
-; CHECK-LABEL: test2
-; CHECK: pmullw
-; CHECK-NEXT: ret
-
; Verify that a vector shift left of 32-bit signed integers is simply expanded
; into a SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
; counts is a constant build_vector.
define <4 x i32> @test3(<4 x i32> %a) {
+; SSE-LABEL: test3:
+; SSE: # BB#0:
+; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test3:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test3:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
%shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
ret <4 x i32> %shl
}
-; CHECK-LABEL: test3
-; CHECK-NOT: cvttps2dq
-; SSE: pmulld
-; AVX2: vpsllvd
-; CHECK-NEXT: ret
-
define <4 x i32> @test4(<4 x i32> %a) {
+; SSE-LABEL: test4:
+; SSE: # BB#0:
+; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test4:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test4:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
%shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
ret <4 x i32> %shl
}
-; CHECK-LABEL: test4
-; CHECK-NOT: cvttps2dq
-; SSE: pmulld
-; AVX2: vpsllvd
-; CHECK-NEXT: ret
-
; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
; into two pmullw instructions. With AVX2, the test case below would produce
; a single vpmullw.
define <16 x i16> @test5(<16 x i16> %a) {
+; SSE-LABEL: test5:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8,128,1,512,2048]
+; SSE-NEXT: pmullw %xmm2, %xmm0
+; SSE-NEXT: pmullw %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test5:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test5:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: retq
%shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
ret <16 x i16> %shl
}
-; CHECK-LABEL: test5
-; SSE: pmullw
-; SSE-NEXT: pmullw
-; AVX2: vpmullw
-; AVX2-NOT: vpmullw
-; CHECK: ret
-
; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
; into two pmulld instructions. With AVX2, the test case below would produce
; a single vpsllvd instead.
define <8 x i32> @test6(<8 x i32> %a) {
+; SSE-LABEL: test6:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8]
+; SSE-NEXT: pmulld %xmm2, %xmm0
+; SSE-NEXT: pmulld %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test6:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test6:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: retq
%shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
ret <8 x i32> %shl
}
-; CHECK-LABEL: test6
-; SSE: pmulld
-; SSE-NEXT: pmulld
-; AVX2: vpsllvd
-; CHECK: ret
-
; With AVX2 and AVX512, the test case below should produce a sequence of
; two vpmullw instructions. On SSE2 instead, we split the shift in four
; parts and then we convert each part into a pmullw.
define <32 x i16> @test7(<32 x i16> %a) {
+; SSE-LABEL: test7:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8,128,1,512,2048]
+; SSE-NEXT: pmullw %xmm4, %xmm0
+; SSE-NEXT: pmullw %xmm4, %xmm1
+; SSE-NEXT: pmullw %xmm4, %xmm2
+; SSE-NEXT: pmullw %xmm4, %xmm3
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test7:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test7:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX512-NEXT: retq
%shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
ret <32 x i16> %shl
}
-; CHECK-LABEL: test7
-; SSE: pmullw
-; SSE-NEXT: pmullw
-; SSE-NEXT: pmullw
-; SSE-NEXT: pmullw
-; AVX2: vpmullw
-; AVX2-NEXT: vpmullw
-; CHECK: ret
-
; Similar to test7; the difference is that with AVX512 support
; we only produce a single vpsllvd/vpsllvq instead of a pair of vpsllvd/vpsllvq.
define <16 x i32> @test8(<16 x i32> %a) {
+; SSE-LABEL: test8:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8]
+; SSE-NEXT: pmulld %xmm4, %xmm0
+; SSE-NEXT: pmulld %xmm4, %xmm1
+; SSE-NEXT: pmulld %xmm4, %xmm2
+; SSE-NEXT: pmulld %xmm4, %xmm3
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
+; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsllvd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: retq
%shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
ret <16 x i32> %shl
}
-; CHECK-LABEL: test8
-; SSE: pmulld
-; SSE-NEXT: pmulld
-; SSE-NEXT: pmulld
-; SSE-NEXT: pmulld
-; AVX2ONLY: vpsllvd
-; AVX2ONLY-NEXT: vpsllvd
-; AVX512: vpsllvd
-; AVX512-NOT: vpsllvd
-; CHECK: ret
-
-; The shift from 'test9' gets scalarized if we don't have AVX2/AVX512f support.
+; The shift from 'test9' gets shifted separately and blended if we don't have AVX2/AVX512f support.
define <8 x i64> @test9(<8 x i64> %a) {
+; SSE-LABEL: test9:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: psllq $3, %xmm4
+; SSE-NEXT: psllq $2, %xmm1
+; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psllq $3, %xmm4
+; SSE-NEXT: psllq $2, %xmm3
+; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
+; SSE-NEXT: paddq %xmm0, %xmm0
+; SSE-NEXT: paddq %xmm2, %xmm2
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test9:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,2,3]
+; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsllvq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test9:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: retq
%shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
ret <8 x i64> %shl
}
-; CHECK-LABEL: test9
-; AVX2ONLY: vpsllvq
-; AVX2ONLY-NEXT: vpsllvq
-; AVX512: vpsllvq
-; AVX512-NOT: vpsllvq
-; CHECK: ret
-
diff --git a/test/CodeGen/X86/vec_shift7.ll b/test/CodeGen/X86/vec_shift7.ll
index cdf828976be4..80d72a4a986f 100644
--- a/test/CodeGen/X86/vec_shift7.ll
+++ b/test/CodeGen/X86/vec_shift7.ll
@@ -1,12 +1,29 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
; Verify that we don't fail when shift by zero is encountered.
define i64 @test1(<2 x i64> %a) {
+; X32-LABEL: test1:
+; X32: # BB#0: # %entry
+; X32-NEXT: movdqa %xmm0, %xmm1
+; X32-NEXT: psllq $2, %xmm1
+; X32-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; X32-NEXT: movd %xmm1, %eax
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X32-NEXT: movd %xmm0, %edx
+; X32-NEXT: retl
+;
+; X64-LABEL: test1:
+; X64: # BB#0: # %entry
+; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: psllq $2, %xmm1
+; X64-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; X64-NEXT: movd %xmm1, %rax
+; X64-NEXT: retq
entry:
%c = shl <2 x i64> %a, <i64 0, i64 2>
%d = extractelement <2 x i64> %c, i32 0
ret i64 %d
}
-; CHECK-LABEL: test1
diff --git a/test/CodeGen/X86/vec_ss_load_fold.ll b/test/CodeGen/X86/vec_ss_load_fold.ll
index ab5031e267dc..076f748009b3 100644
--- a/test/CodeGen/X86/vec_ss_load_fold.ll
+++ b/test/CodeGen/X86/vec_ss_load_fold.ll
@@ -1,39 +1,55 @@
-; RUN: llc < %s -march=x86 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s
target datalayout = "e-p:32:32"
-target triple = "i686-apple-darwin8.7.2"
define i16 @test1(float %f) nounwind {
- %tmp = insertelement <4 x float> undef, float %f, i32 0 ; <<4 x float>> [#uses=1]
- %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
- %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
- %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
- %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
- %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
- %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
- %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer ) ; <<4 x float>> [#uses=1]
- %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; <i32> [#uses=1]
- %tmp69 = trunc i32 %tmp.upgrd.1 to i16 ; <i16> [#uses=1]
- ret i16 %tmp69
; CHECK-LABEL: test1:
-; CHECK: subss LCPI0_
-; CHECK: mulss LCPI0_
-; CHECK: minss LCPI0_
+; CHECK: ## BB#0:
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: subss LCPI0_0, %xmm0
+; CHECK-NEXT: mulss LCPI0_1, %xmm0
+; CHECK-NEXT: minss LCPI0_2, %xmm0
+; CHECK-NEXT: maxss %xmm1, %xmm0
+; CHECK-NEXT: cvttss2si %xmm0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retl
+;
+ %tmp = insertelement <4 x float> undef, float %f, i32 0 ; <<4 x float>> [#uses=1]
+ %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
+ %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
+ %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
+ %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
+ %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
+ %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
+ %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer ) ; <<4 x float>> [#uses=1]
+ %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; <i32> [#uses=1]
+ %tmp69 = trunc i32 %tmp.upgrd.1 to i16 ; <i16> [#uses=1]
+ ret i16 %tmp69
}
define i16 @test2(float %f) nounwind {
- %tmp28 = fsub float %f, 1.000000e+00 ; <float> [#uses=1]
- %tmp37 = fmul float %tmp28, 5.000000e-01 ; <float> [#uses=1]
- %tmp375 = insertelement <4 x float> undef, float %tmp37, i32 0 ; <<4 x float>> [#uses=1]
- %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp375, <4 x float> < float 6.553500e+04, float undef, float undef, float undef > ) ; <<4 x float>> [#uses=1]
- %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> < float 0.000000e+00, float undef, float undef, float undef > ) ; <<4 x float>> [#uses=1]
- %tmp = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; <i32> [#uses=1]
- %tmp69 = trunc i32 %tmp to i16 ; <i16> [#uses=1]
- ret i16 %tmp69
; CHECK-LABEL: test2:
-; CHECK: addss LCPI1_
-; CHECK: mulss LCPI1_
-; CHECK: minss LCPI1_
+; CHECK: ## BB#0:
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: addss LCPI1_0, %xmm0
+; CHECK-NEXT: mulss LCPI1_1, %xmm0
+; CHECK-NEXT: minss LCPI1_2, %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: maxss %xmm1, %xmm0
+; CHECK-NEXT: cvttss2si %xmm0, %eax
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: retl
+;
+ %tmp28 = fsub float %f, 1.000000e+00 ; <float> [#uses=1]
+ %tmp37 = fmul float %tmp28, 5.000000e-01 ; <float> [#uses=1]
+ %tmp375 = insertelement <4 x float> undef, float %tmp37, i32 0 ; <<4 x float>> [#uses=1]
+ %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp375, <4 x float> < float 6.553500e+04, float undef, float undef, float undef > ) ; <<4 x float>> [#uses=1]
+ %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> < float 0.000000e+00, float undef, float undef, float undef > ) ; <<4 x float>> [#uses=1]
+ %tmp = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; <i32> [#uses=1]
+ %tmp69 = trunc i32 %tmp to i16 ; <i16> [#uses=1]
+ ret i16 %tmp69
}
declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)
@@ -46,41 +62,56 @@ declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)
declare i32 @llvm.x86.sse.cvttss2si(<4 x float>)
-
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32)
+
declare <4 x float> @f()
define <4 x float> @test3(<4 x float> %A, float *%b, i32 %C) nounwind {
+; CHECK-LABEL: test3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: roundss $4, (%eax), %xmm0
+; CHECK-NEXT: retl
+;
%a = load float , float *%b
%B = insertelement <4 x float> undef, float %a, i32 0
%X = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %A, <4 x float> %B, i32 4)
ret <4 x float> %X
-; CHECK-LABEL: test3:
-; CHECK: roundss $4, (%eax), %xmm0
}
define <4 x float> @test4(<4 x float> %A, float *%b, i32 %C) nounwind {
+; CHECK-LABEL: test4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: subl $28, %esp
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps %xmm0, (%esp) ## 16-byte Spill
+; CHECK-NEXT: calll _f
+; CHECK-NEXT: movaps (%esp), %xmm1 ## 16-byte Reload
+; CHECK-NEXT: roundss $4, %xmm1, %xmm0
+; CHECK-NEXT: addl $28, %esp
+; CHECK-NEXT: retl
+;
%a = load float , float *%b
%B = insertelement <4 x float> undef, float %a, i32 0
%q = call <4 x float> @f()
%X = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %q, <4 x float> %B, i32 4)
ret <4 x float> %X
-; CHECK-LABEL: test4:
-; CHECK: movss (%eax), %xmm
-; CHECK: call
-; CHECK: roundss $4, %xmm{{.*}}, %xmm0
}
-; PR13576
+; PR13576
define <2 x double> @test5() nounwind uwtable readnone noinline {
+; CHECK-LABEL: test5:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: movaps {{.*#+}} xmm0 = [4.569870e+02,1.233210e+02]
+; CHECK-NEXT: movl $128, %eax
+; CHECK-NEXT: cvtsi2sdl %eax, %xmm0
+; CHECK-NEXT: retl
+;
entry:
%0 = tail call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> <double
4.569870e+02, double 1.233210e+02>, i32 128) nounwind readnone
ret <2 x double> %0
-; CHECK-LABEL: test5:
-; CHECK: mov
-; CHECK: mov
-; CHECK: cvtsi2sd
}
declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
diff --git a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
index dfc186bef052..c0e02bd15996 100644
--- a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
+++ b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
@@ -65,7 +65,9 @@ define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) {
;
; AVX512F-LABEL: test_uitofp_v4i32_to_v4f32:
; AVX512F: # BB#0:
+; AVX512F-NEXT: # kill
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
+; AVX512F-NEXT: # kill
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_uitofp_v4i32_to_v4f32:
@@ -142,7 +144,9 @@ define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) {
;
; AVX512F-LABEL: test_uitofp_v8i32_to_v8f32:
; AVX512F: # BB#0:
+; AVX512F-NEXT: # kill
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
+; AVX512F-NEXT: # kill
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_uitofp_v8i32_to_v8f32:
diff --git a/test/CodeGen/X86/vector-bitreverse.ll b/test/CodeGen/X86/vector-bitreverse.ll
new file mode 100644
index 000000000000..5a443991c53f
--- /dev/null
+++ b/test/CodeGen/X86/vector-bitreverse.ll
@@ -0,0 +1,3772 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
+
+define i8 @test_bitreverse_i8(i8 %a) nounwind {
+; SSE-LABEL: test_bitreverse_i8:
+; SSE: # BB#0:
+; SSE-NEXT: movl %edi, %eax
+; SSE-NEXT: shlb $7, %al
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shlb $5, %cl
+; SSE-NEXT: andb $64, %cl
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shlb $3, %dl
+; SSE-NEXT: andb $32, %dl
+; SSE-NEXT: orb %cl, %dl
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: addb %cl, %cl
+; SSE-NEXT: andb $16, %cl
+; SSE-NEXT: orb %dl, %cl
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrb %dl
+; SSE-NEXT: andb $8, %dl
+; SSE-NEXT: orb %cl, %dl
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shrb $3, %cl
+; SSE-NEXT: andb $4, %cl
+; SSE-NEXT: orb %dl, %cl
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrb $5, %dl
+; SSE-NEXT: andb $2, %dl
+; SSE-NEXT: orb %cl, %dl
+; SSE-NEXT: shrb $7, %dil
+; SSE-NEXT: orb %dl, %dil
+; SSE-NEXT: orb %al, %dil
+; SSE-NEXT: movl %edi, %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_bitreverse_i8:
+; AVX: # BB#0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: shlb $7, %al
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shlb $5, %cl
+; AVX-NEXT: andb $64, %cl
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shlb $3, %dl
+; AVX-NEXT: andb $32, %dl
+; AVX-NEXT: orb %cl, %dl
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: addb %cl, %cl
+; AVX-NEXT: andb $16, %cl
+; AVX-NEXT: orb %dl, %cl
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrb %dl
+; AVX-NEXT: andb $8, %dl
+; AVX-NEXT: orb %cl, %dl
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shrb $3, %cl
+; AVX-NEXT: andb $4, %cl
+; AVX-NEXT: orb %dl, %cl
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrb $5, %dl
+; AVX-NEXT: andb $2, %dl
+; AVX-NEXT: orb %cl, %dl
+; AVX-NEXT: shrb $7, %dil
+; AVX-NEXT: orb %dl, %dil
+; AVX-NEXT: orb %al, %dil
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: retq
+;
+; XOP-LABEL: test_bitreverse_i8:
+; XOP: # BB#0:
+; XOP-NEXT: vmovd %edi, %xmm0
+; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; XOP-NEXT: vpextrb $0, %xmm0, %eax
+; XOP-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; XOP-NEXT: retq
+ %b = call i8 @llvm.bitreverse.i8(i8 %a)
+ ret i8 %b
+}
+
+define i16 @test_bitreverse_i16(i16 %a) nounwind {
+; SSE-LABEL: test_bitreverse_i16:
+; SSE: # BB#0:
+; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: andl $32768, %ecx # imm = 0x8000
+; SSE-NEXT: movl %edi, %eax
+; SSE-NEXT: shll $15, %eax
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: andl $2, %edx
+; SSE-NEXT: shll $13, %edx
+; SSE-NEXT: leal (%rdx,%rax), %eax
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: andl $4, %edx
+; SSE-NEXT: shll $11, %edx
+; SSE-NEXT: orl %edx, %eax
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: andl $8, %edx
+; SSE-NEXT: shll $9, %edx
+; SSE-NEXT: orl %edx, %eax
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: andl $16, %edx
+; SSE-NEXT: shll $7, %edx
+; SSE-NEXT: orl %edx, %eax
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: andl $32, %edx
+; SSE-NEXT: shll $5, %edx
+; SSE-NEXT: orl %edx, %eax
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: andl $64, %edx
+; SSE-NEXT: shll $3, %edx
+; SSE-NEXT: leal (%rdi,%rdi), %esi
+; SSE-NEXT: andl $256, %esi # imm = 0x100
+; SSE-NEXT: orl %edx, %esi
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl %edx
+; SSE-NEXT: andl $128, %edx
+; SSE-NEXT: orl %esi, %edx
+; SSE-NEXT: movl %edi, %esi
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $64, %esi
+; SSE-NEXT: orl %edx, %esi
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl $5, %edx
+; SSE-NEXT: andl $32, %edx
+; SSE-NEXT: orl %esi, %edx
+; SSE-NEXT: movl %edi, %esi
+; SSE-NEXT: shrl $7, %esi
+; SSE-NEXT: andl $16, %esi
+; SSE-NEXT: orl %edx, %esi
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl $9, %edx
+; SSE-NEXT: andl $8, %edx
+; SSE-NEXT: orl %esi, %edx
+; SSE-NEXT: movl %edi, %esi
+; SSE-NEXT: shrl $11, %esi
+; SSE-NEXT: andl $4, %esi
+; SSE-NEXT: orl %edx, %esi
+; SSE-NEXT: shrl $13, %edi
+; SSE-NEXT: andl $2, %edi
+; SSE-NEXT: orl %esi, %edi
+; SSE-NEXT: shrl $15, %ecx
+; SSE-NEXT: orl %edi, %ecx
+; SSE-NEXT: orl %ecx, %eax
+; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_bitreverse_i16:
+; AVX: # BB#0:
+; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: andl $32768, %ecx # imm = 0x8000
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: shll $15, %eax
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: andl $2, %edx
+; AVX-NEXT: shll $13, %edx
+; AVX-NEXT: leal (%rdx,%rax), %eax
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: andl $4, %edx
+; AVX-NEXT: shll $11, %edx
+; AVX-NEXT: orl %edx, %eax
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: andl $8, %edx
+; AVX-NEXT: shll $9, %edx
+; AVX-NEXT: orl %edx, %eax
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: andl $16, %edx
+; AVX-NEXT: shll $7, %edx
+; AVX-NEXT: orl %edx, %eax
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: andl $32, %edx
+; AVX-NEXT: shll $5, %edx
+; AVX-NEXT: orl %edx, %eax
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: andl $64, %edx
+; AVX-NEXT: shll $3, %edx
+; AVX-NEXT: leal (%rdi,%rdi), %esi
+; AVX-NEXT: andl $256, %esi # imm = 0x100
+; AVX-NEXT: orl %edx, %esi
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: andl $128, %edx
+; AVX-NEXT: orl %esi, %edx
+; AVX-NEXT: movl %edi, %esi
+; AVX-NEXT: shrl $3, %esi
+; AVX-NEXT: andl $64, %esi
+; AVX-NEXT: orl %edx, %esi
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl $5, %edx
+; AVX-NEXT: andl $32, %edx
+; AVX-NEXT: orl %esi, %edx
+; AVX-NEXT: movl %edi, %esi
+; AVX-NEXT: shrl $7, %esi
+; AVX-NEXT: andl $16, %esi
+; AVX-NEXT: orl %edx, %esi
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl $9, %edx
+; AVX-NEXT: andl $8, %edx
+; AVX-NEXT: orl %esi, %edx
+; AVX-NEXT: movl %edi, %esi
+; AVX-NEXT: shrl $11, %esi
+; AVX-NEXT: andl $4, %esi
+; AVX-NEXT: orl %edx, %esi
+; AVX-NEXT: shrl $13, %edi
+; AVX-NEXT: andl $2, %edi
+; AVX-NEXT: orl %esi, %edi
+; AVX-NEXT: shrl $15, %ecx
+; AVX-NEXT: orl %edi, %ecx
+; AVX-NEXT: orl %ecx, %eax
+; AVX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX-NEXT: retq
+;
+; XOP-LABEL: test_bitreverse_i16:
+; XOP: # BB#0:
+; XOP-NEXT: vmovd %edi, %xmm0
+; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; XOP-NEXT: vmovd %xmm0, %eax
+; XOP-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; XOP-NEXT: retq
+ %b = call i16 @llvm.bitreverse.i16(i16 %a)
+ ret i16 %b
+}
+
+define i32 @test_bitreverse_i32(i32 %a) nounwind {
+; SSE-LABEL: test_bitreverse_i32:
+; SSE: # BB#0:
+; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE-NEXT: movl %edi, %eax
+; SSE-NEXT: shll $31, %eax
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: andl $2, %ecx
+; SSE-NEXT: shll $29, %ecx
+; SSE-NEXT: leal (%rcx,%rax), %eax
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: andl $4, %ecx
+; SSE-NEXT: shll $27, %ecx
+; SSE-NEXT: orl %ecx, %eax
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: andl $8, %ecx
+; SSE-NEXT: shll $25, %ecx
+; SSE-NEXT: orl %ecx, %eax
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: andl $16, %ecx
+; SSE-NEXT: shll $23, %ecx
+; SSE-NEXT: orl %ecx, %eax
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: andl $32, %ecx
+; SSE-NEXT: shll $21, %ecx
+; SSE-NEXT: orl %ecx, %eax
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: andl $64, %ecx
+; SSE-NEXT: shll $19, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shll $17, %edx
+; SSE-NEXT: andl $16777216, %edx # imm = 0x1000000
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shll $15, %ecx
+; SSE-NEXT: andl $8388608, %ecx # imm = 0x800000
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shll $13, %edx
+; SSE-NEXT: andl $4194304, %edx # imm = 0x400000
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shll $11, %ecx
+; SSE-NEXT: andl $2097152, %ecx # imm = 0x200000
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shll $9, %edx
+; SSE-NEXT: andl $1048576, %edx # imm = 0x100000
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shll $7, %ecx
+; SSE-NEXT: andl $524288, %ecx # imm = 0x80000
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shll $5, %edx
+; SSE-NEXT: andl $262144, %edx # imm = 0x40000
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: leal (,%rdi,8), %ecx
+; SSE-NEXT: andl $131072, %ecx # imm = 0x20000
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: leal (%rdi,%rdi), %edx
+; SSE-NEXT: andl $65536, %edx # imm = 0x10000
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shrl %ecx
+; SSE-NEXT: andl $32768, %ecx # imm = 0x8000
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl $3, %edx
+; SSE-NEXT: andl $16384, %edx # imm = 0x4000
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shrl $5, %ecx
+; SSE-NEXT: andl $8192, %ecx # imm = 0x2000
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl $7, %edx
+; SSE-NEXT: andl $4096, %edx # imm = 0x1000
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shrl $9, %ecx
+; SSE-NEXT: andl $2048, %ecx # imm = 0x800
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl $11, %edx
+; SSE-NEXT: andl $1024, %edx # imm = 0x400
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shrl $13, %ecx
+; SSE-NEXT: andl $512, %ecx # imm = 0x200
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl $15, %edx
+; SSE-NEXT: andl $256, %edx # imm = 0x100
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shrl $17, %ecx
+; SSE-NEXT: andl $128, %ecx
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl $19, %edx
+; SSE-NEXT: andl $64, %edx
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shrl $21, %ecx
+; SSE-NEXT: andl $32, %ecx
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl $23, %edx
+; SSE-NEXT: andl $16, %edx
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shrl $25, %ecx
+; SSE-NEXT: andl $8, %ecx
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: movl %edi, %edx
+; SSE-NEXT: shrl $27, %edx
+; SSE-NEXT: andl $4, %edx
+; SSE-NEXT: orl %ecx, %edx
+; SSE-NEXT: movl %edi, %ecx
+; SSE-NEXT: shrl $29, %ecx
+; SSE-NEXT: andl $2, %ecx
+; SSE-NEXT: orl %edx, %ecx
+; SSE-NEXT: shrl $31, %edi
+; SSE-NEXT: orl %ecx, %edi
+; SSE-NEXT: orl %edi, %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_bitreverse_i32:
+; AVX: # BB#0:
+; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: shll $31, %eax
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: andl $2, %ecx
+; AVX-NEXT: shll $29, %ecx
+; AVX-NEXT: leal (%rcx,%rax), %eax
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: andl $4, %ecx
+; AVX-NEXT: shll $27, %ecx
+; AVX-NEXT: orl %ecx, %eax
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: andl $8, %ecx
+; AVX-NEXT: shll $25, %ecx
+; AVX-NEXT: orl %ecx, %eax
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: andl $16, %ecx
+; AVX-NEXT: shll $23, %ecx
+; AVX-NEXT: orl %ecx, %eax
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: andl $32, %ecx
+; AVX-NEXT: shll $21, %ecx
+; AVX-NEXT: orl %ecx, %eax
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: andl $64, %ecx
+; AVX-NEXT: shll $19, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shll $17, %edx
+; AVX-NEXT: andl $16777216, %edx # imm = 0x1000000
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shll $15, %ecx
+; AVX-NEXT: andl $8388608, %ecx # imm = 0x800000
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shll $13, %edx
+; AVX-NEXT: andl $4194304, %edx # imm = 0x400000
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shll $11, %ecx
+; AVX-NEXT: andl $2097152, %ecx # imm = 0x200000
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shll $9, %edx
+; AVX-NEXT: andl $1048576, %edx # imm = 0x100000
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shll $7, %ecx
+; AVX-NEXT: andl $524288, %ecx # imm = 0x80000
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shll $5, %edx
+; AVX-NEXT: andl $262144, %edx # imm = 0x40000
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: leal (,%rdi,8), %ecx
+; AVX-NEXT: andl $131072, %ecx # imm = 0x20000
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: leal (%rdi,%rdi), %edx
+; AVX-NEXT: andl $65536, %edx # imm = 0x10000
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shrl %ecx
+; AVX-NEXT: andl $32768, %ecx # imm = 0x8000
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl $3, %edx
+; AVX-NEXT: andl $16384, %edx # imm = 0x4000
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shrl $5, %ecx
+; AVX-NEXT: andl $8192, %ecx # imm = 0x2000
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl $7, %edx
+; AVX-NEXT: andl $4096, %edx # imm = 0x1000
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shrl $9, %ecx
+; AVX-NEXT: andl $2048, %ecx # imm = 0x800
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl $11, %edx
+; AVX-NEXT: andl $1024, %edx # imm = 0x400
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shrl $13, %ecx
+; AVX-NEXT: andl $512, %ecx # imm = 0x200
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl $15, %edx
+; AVX-NEXT: andl $256, %edx # imm = 0x100
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shrl $17, %ecx
+; AVX-NEXT: andl $128, %ecx
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl $19, %edx
+; AVX-NEXT: andl $64, %edx
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shrl $21, %ecx
+; AVX-NEXT: andl $32, %ecx
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl $23, %edx
+; AVX-NEXT: andl $16, %edx
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shrl $25, %ecx
+; AVX-NEXT: andl $8, %ecx
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: movl %edi, %edx
+; AVX-NEXT: shrl $27, %edx
+; AVX-NEXT: andl $4, %edx
+; AVX-NEXT: orl %ecx, %edx
+; AVX-NEXT: movl %edi, %ecx
+; AVX-NEXT: shrl $29, %ecx
+; AVX-NEXT: andl $2, %ecx
+; AVX-NEXT: orl %edx, %ecx
+; AVX-NEXT: shrl $31, %edi
+; AVX-NEXT: orl %ecx, %edi
+; AVX-NEXT: orl %edi, %eax
+; AVX-NEXT: retq
+;
+; XOP-LABEL: test_bitreverse_i32:
+; XOP: # BB#0:
+; XOP-NEXT: vmovd %edi, %xmm0
+; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; XOP-NEXT: vmovd %xmm0, %eax
+; XOP-NEXT: retq
+ %b = call i32 @llvm.bitreverse.i32(i32 %a)
+ ret i32 %b
+}
+
+define i64 @test_bitreverse_i64(i64 %a) nounwind {
+; SSE-LABEL: test_bitreverse_i64:
+; SSE: # BB#0:
+; SSE-NEXT: leaq (%rdi,%rdi), %rax
+; SSE-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
+; SSE-NEXT: andq %rax, %rcx
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: shlq $63, %rax
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $2, %rdx
+; SSE-NEXT: shlq $61, %rdx
+; SSE-NEXT: leaq (%rdx,%rax), %rax
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $4, %rdx
+; SSE-NEXT: shlq $59, %rdx
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $8, %rdx
+; SSE-NEXT: shlq $57, %rdx
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $16, %rdx
+; SSE-NEXT: shlq $55, %rdx
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $32, %rdx
+; SSE-NEXT: shlq $53, %rdx
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $64, %rdx
+; SSE-NEXT: shlq $51, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $128, %rsi
+; SSE-NEXT: shlq $49, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $256, %rdx # imm = 0x100
+; SSE-NEXT: shlq $47, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $512, %rsi # imm = 0x200
+; SSE-NEXT: shlq $45, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $1024, %rdx # imm = 0x400
+; SSE-NEXT: shlq $43, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $2048, %rsi # imm = 0x800
+; SSE-NEXT: shlq $41, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $4096, %rdx # imm = 0x1000
+; SSE-NEXT: shlq $39, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $8192, %rsi # imm = 0x2000
+; SSE-NEXT: shlq $37, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $16384, %rdx # imm = 0x4000
+; SSE-NEXT: shlq $35, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $32768, %rsi # imm = 0x8000
+; SSE-NEXT: shlq $33, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $65536, %rdx # imm = 0x10000
+; SSE-NEXT: shlq $31, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $131072, %rsi # imm = 0x20000
+; SSE-NEXT: shlq $29, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $262144, %rdx # imm = 0x40000
+; SSE-NEXT: shlq $27, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $524288, %rsi # imm = 0x80000
+; SSE-NEXT: shlq $25, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $1048576, %rdx # imm = 0x100000
+; SSE-NEXT: shlq $23, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $2097152, %rsi # imm = 0x200000
+; SSE-NEXT: shlq $21, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $4194304, %rdx # imm = 0x400000
+; SSE-NEXT: shlq $19, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $8388608, %rsi # imm = 0x800000
+; SSE-NEXT: shlq $17, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $16777216, %rdx # imm = 0x1000000
+; SSE-NEXT: shlq $15, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $33554432, %rsi # imm = 0x2000000
+; SSE-NEXT: shlq $13, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $67108864, %rdx # imm = 0x4000000
+; SSE-NEXT: shlq $11, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $134217728, %rsi # imm = 0x8000000
+; SSE-NEXT: shlq $9, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $268435456, %rdx # imm = 0x10000000
+; SSE-NEXT: shlq $7, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: movq %rdi, %rsi
+; SSE-NEXT: andq $536870912, %rsi # imm = 0x20000000
+; SSE-NEXT: shlq $5, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: andq $1073741824, %rdx # imm = 0x40000000
+; SSE-NEXT: shlq $3, %rdx
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $-2147483648, %ecx # imm = 0x80000000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $3, %rdx
+; SSE-NEXT: andl $1073741824, %edx # imm = 0x40000000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $5, %rcx
+; SSE-NEXT: andl $536870912, %ecx # imm = 0x20000000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $7, %rdx
+; SSE-NEXT: andl $268435456, %edx # imm = 0x10000000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $9, %rcx
+; SSE-NEXT: andl $134217728, %ecx # imm = 0x8000000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $11, %rdx
+; SSE-NEXT: andl $67108864, %edx # imm = 0x4000000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $13, %rcx
+; SSE-NEXT: andl $33554432, %ecx # imm = 0x2000000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $15, %rdx
+; SSE-NEXT: andl $16777216, %edx # imm = 0x1000000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $17, %rcx
+; SSE-NEXT: andl $8388608, %ecx # imm = 0x800000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $19, %rdx
+; SSE-NEXT: andl $4194304, %edx # imm = 0x400000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $21, %rcx
+; SSE-NEXT: andl $2097152, %ecx # imm = 0x200000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $23, %rdx
+; SSE-NEXT: andl $1048576, %edx # imm = 0x100000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $25, %rcx
+; SSE-NEXT: andl $524288, %ecx # imm = 0x80000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $27, %rdx
+; SSE-NEXT: andl $262144, %edx # imm = 0x40000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $29, %rcx
+; SSE-NEXT: andl $131072, %ecx # imm = 0x20000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $31, %rdx
+; SSE-NEXT: andl $65536, %edx # imm = 0x10000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $33, %rcx
+; SSE-NEXT: andl $32768, %ecx # imm = 0x8000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $35, %rdx
+; SSE-NEXT: andl $16384, %edx # imm = 0x4000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $37, %rcx
+; SSE-NEXT: andl $8192, %ecx # imm = 0x2000
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $39, %rdx
+; SSE-NEXT: andl $4096, %edx # imm = 0x1000
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $41, %rcx
+; SSE-NEXT: andl $2048, %ecx # imm = 0x800
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $43, %rdx
+; SSE-NEXT: andl $1024, %edx # imm = 0x400
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $45, %rcx
+; SSE-NEXT: andl $512, %ecx # imm = 0x200
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $47, %rdx
+; SSE-NEXT: andl $256, %edx # imm = 0x100
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $49, %rcx
+; SSE-NEXT: andl $128, %ecx
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $51, %rdx
+; SSE-NEXT: andl $64, %edx
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $53, %rcx
+; SSE-NEXT: andl $32, %ecx
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $55, %rdx
+; SSE-NEXT: andl $16, %edx
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $57, %rcx
+; SSE-NEXT: andl $8, %ecx
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: movq %rdi, %rdx
+; SSE-NEXT: shrq $59, %rdx
+; SSE-NEXT: andl $4, %edx
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq %rdi, %rcx
+; SSE-NEXT: shrq $61, %rcx
+; SSE-NEXT: andl $2, %ecx
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: shrq $63, %rdi
+; SSE-NEXT: orq %rcx, %rdi
+; SSE-NEXT: orq %rdi, %rax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_bitreverse_i64:
+; AVX: # BB#0:
+; AVX-NEXT: leaq (%rdi,%rdi), %rax
+; AVX-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
+; AVX-NEXT: andq %rax, %rcx
+; AVX-NEXT: movq %rdi, %rax
+; AVX-NEXT: shlq $63, %rax
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $2, %rdx
+; AVX-NEXT: shlq $61, %rdx
+; AVX-NEXT: leaq (%rdx,%rax), %rax
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $4, %rdx
+; AVX-NEXT: shlq $59, %rdx
+; AVX-NEXT: orq %rdx, %rax
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $8, %rdx
+; AVX-NEXT: shlq $57, %rdx
+; AVX-NEXT: orq %rdx, %rax
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $16, %rdx
+; AVX-NEXT: shlq $55, %rdx
+; AVX-NEXT: orq %rdx, %rax
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $32, %rdx
+; AVX-NEXT: shlq $53, %rdx
+; AVX-NEXT: orq %rdx, %rax
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $64, %rdx
+; AVX-NEXT: shlq $51, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $128, %rsi
+; AVX-NEXT: shlq $49, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $256, %rdx # imm = 0x100
+; AVX-NEXT: shlq $47, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $512, %rsi # imm = 0x200
+; AVX-NEXT: shlq $45, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $1024, %rdx # imm = 0x400
+; AVX-NEXT: shlq $43, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $2048, %rsi # imm = 0x800
+; AVX-NEXT: shlq $41, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $4096, %rdx # imm = 0x1000
+; AVX-NEXT: shlq $39, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $8192, %rsi # imm = 0x2000
+; AVX-NEXT: shlq $37, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $16384, %rdx # imm = 0x4000
+; AVX-NEXT: shlq $35, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $32768, %rsi # imm = 0x8000
+; AVX-NEXT: shlq $33, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $65536, %rdx # imm = 0x10000
+; AVX-NEXT: shlq $31, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $131072, %rsi # imm = 0x20000
+; AVX-NEXT: shlq $29, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $262144, %rdx # imm = 0x40000
+; AVX-NEXT: shlq $27, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $524288, %rsi # imm = 0x80000
+; AVX-NEXT: shlq $25, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $1048576, %rdx # imm = 0x100000
+; AVX-NEXT: shlq $23, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $2097152, %rsi # imm = 0x200000
+; AVX-NEXT: shlq $21, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $4194304, %rdx # imm = 0x400000
+; AVX-NEXT: shlq $19, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $8388608, %rsi # imm = 0x800000
+; AVX-NEXT: shlq $17, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $16777216, %rdx # imm = 0x1000000
+; AVX-NEXT: shlq $15, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $33554432, %rsi # imm = 0x2000000
+; AVX-NEXT: shlq $13, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $67108864, %rdx # imm = 0x4000000
+; AVX-NEXT: shlq $11, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $134217728, %rsi # imm = 0x8000000
+; AVX-NEXT: shlq $9, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $268435456, %rdx # imm = 0x10000000
+; AVX-NEXT: shlq $7, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: movq %rdi, %rsi
+; AVX-NEXT: andq $536870912, %rsi # imm = 0x20000000
+; AVX-NEXT: shlq $5, %rsi
+; AVX-NEXT: orq %rdx, %rsi
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: andq $1073741824, %rdx # imm = 0x40000000
+; AVX-NEXT: shlq $3, %rdx
+; AVX-NEXT: orq %rsi, %rdx
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: andl $-2147483648, %ecx # imm = 0x80000000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $3, %rdx
+; AVX-NEXT: andl $1073741824, %edx # imm = 0x40000000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $5, %rcx
+; AVX-NEXT: andl $536870912, %ecx # imm = 0x20000000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $7, %rdx
+; AVX-NEXT: andl $268435456, %edx # imm = 0x10000000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $9, %rcx
+; AVX-NEXT: andl $134217728, %ecx # imm = 0x8000000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $11, %rdx
+; AVX-NEXT: andl $67108864, %edx # imm = 0x4000000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $13, %rcx
+; AVX-NEXT: andl $33554432, %ecx # imm = 0x2000000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $15, %rdx
+; AVX-NEXT: andl $16777216, %edx # imm = 0x1000000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $17, %rcx
+; AVX-NEXT: andl $8388608, %ecx # imm = 0x800000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $19, %rdx
+; AVX-NEXT: andl $4194304, %edx # imm = 0x400000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $21, %rcx
+; AVX-NEXT: andl $2097152, %ecx # imm = 0x200000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $23, %rdx
+; AVX-NEXT: andl $1048576, %edx # imm = 0x100000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $25, %rcx
+; AVX-NEXT: andl $524288, %ecx # imm = 0x80000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $27, %rdx
+; AVX-NEXT: andl $262144, %edx # imm = 0x40000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $29, %rcx
+; AVX-NEXT: andl $131072, %ecx # imm = 0x20000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $31, %rdx
+; AVX-NEXT: andl $65536, %edx # imm = 0x10000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $33, %rcx
+; AVX-NEXT: andl $32768, %ecx # imm = 0x8000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $35, %rdx
+; AVX-NEXT: andl $16384, %edx # imm = 0x4000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $37, %rcx
+; AVX-NEXT: andl $8192, %ecx # imm = 0x2000
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $39, %rdx
+; AVX-NEXT: andl $4096, %edx # imm = 0x1000
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $41, %rcx
+; AVX-NEXT: andl $2048, %ecx # imm = 0x800
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $43, %rdx
+; AVX-NEXT: andl $1024, %edx # imm = 0x400
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $45, %rcx
+; AVX-NEXT: andl $512, %ecx # imm = 0x200
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $47, %rdx
+; AVX-NEXT: andl $256, %edx # imm = 0x100
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $49, %rcx
+; AVX-NEXT: andl $128, %ecx
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $51, %rdx
+; AVX-NEXT: andl $64, %edx
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $53, %rcx
+; AVX-NEXT: andl $32, %ecx
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $55, %rdx
+; AVX-NEXT: andl $16, %edx
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $57, %rcx
+; AVX-NEXT: andl $8, %ecx
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: movq %rdi, %rdx
+; AVX-NEXT: shrq $59, %rdx
+; AVX-NEXT: andl $4, %edx
+; AVX-NEXT: orq %rcx, %rdx
+; AVX-NEXT: movq %rdi, %rcx
+; AVX-NEXT: shrq $61, %rcx
+; AVX-NEXT: andl $2, %ecx
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: shrq $63, %rdi
+; AVX-NEXT: orq %rcx, %rdi
+; AVX-NEXT: orq %rdi, %rax
+; AVX-NEXT: retq
+;
+; XOP-LABEL: test_bitreverse_i64:
+; XOP: # BB#0:
+; XOP-NEXT: vmovq %rdi, %xmm0
+; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; XOP-NEXT: vmovq %xmm0, %rax
+; XOP-NEXT: retq
+ %b = call i64 @llvm.bitreverse.i64(i64 %a)
+ ret i64 %b
+}
+
+define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrlw $7, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm1, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psllw $7, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm3, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psllw $5, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllw $3, %xmm4
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: paddb %xmm3, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrlw $3, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: psrlw $5, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v16i8:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: pshufb %xmm2, %xmm3
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: pshufb %xmm0, %xmm1
+; SSSE3-NEXT: por %xmm3, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: test_bitreverse_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; XOP-LABEL: test_bitreverse_v16i8:
+; XOP: # BB#0:
+; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; XOP-NEXT: retq
+ %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
+ ret <16 x i8> %b
+}
+
+define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v8i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psllw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm2, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psllw $5, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psllw $3, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrlw $5, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v8i16:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: pshufb %xmm2, %xmm3
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: pshufb %xmm0, %xmm1
+; SSSE3-NEXT: por %xmm3, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: test_bitreverse_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; XOP-LABEL: test_bitreverse_v8i16:
+; XOP: # BB#0:
+; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; XOP-NEXT: retq
+ %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
+ ret <8 x i16> %b
+}
+
+define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psllw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm2, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psllw $5, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psllw $3, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrlw $5, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v4i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: pshufb %xmm2, %xmm3
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: pshufb %xmm0, %xmm1
+; SSSE3-NEXT: por %xmm3, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: test_bitreverse_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; XOP-LABEL: test_bitreverse_v4i32:
+; XOP: # BB#0:
+; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; XOP-NEXT: retq
+ %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
+ ret <4 x i32> %b
+}
+
+define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psllw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm2, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psllw $5, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psllw $3, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: paddb %xmm0, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $3, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrlw $5, %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm0, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v2i64:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm1, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: pshufb %xmm2, %xmm3
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: pshufb %xmm0, %xmm1
+; SSSE3-NEXT: por %xmm3, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: test_bitreverse_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; XOP-LABEL: test_bitreverse_v2i64:
+; XOP: # BB#0:
+; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; XOP-NEXT: retq
+ %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+ ret <2 x i64> %b
+}
+
+define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v32i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psllw $7, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm10, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psllw $3, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrlw $3, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psrlw $5, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm7
+; SSE2-NEXT: por %xmm4, %xmm7
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm3, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psllw $5, %xmm4
+; SSE2-NEXT: pand %xmm9, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psllw $7, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psllw $3, %xmm7
+; SSE2-NEXT: pand %xmm11, %xmm7
+; SSE2-NEXT: por %xmm4, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: paddb %xmm4, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: por %xmm7, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psrlw $1, %xmm7
+; SSE2-NEXT: pand %xmm12, %xmm7
+; SSE2-NEXT: por %xmm4, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrlw $3, %xmm4
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: por %xmm7, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm2, %xmm6
+; SSE2-NEXT: por %xmm4, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v32i8:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm4, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pshufb %xmm2, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm4, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pshufb %xmm0, %xmm3
+; SSSE3-NEXT: por %xmm6, %xmm3
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm4, %xmm0
+; SSSE3-NEXT: pshufb %xmm0, %xmm5
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: pshufb %xmm1, %xmm2
+; SSSE3-NEXT: por %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test_bitreverse_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_bitreverse_v32i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: retq
+;
+; XOPAVX1-LABEL: test_bitreverse_v32i8:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v32i8:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: retq
+ %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
+ ret <32 x i8> %b
+}
+
+define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v16i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psllw $7, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm11, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllw $3, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm4
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
+; SSE2-NEXT: pand %xmm13, %xmm4
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psrlw $5, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm4, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psllw $7, %xmm3
+; SSE2-NEXT: pand %xmm11, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psllw $3, %xmm7
+; SSE2-NEXT: pand %xmm12, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psrlw $1, %xmm7
+; SSE2-NEXT: pand %xmm13, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm2, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v16i16:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; SSSE3-NEXT: pshufb %xmm4, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm6, %xmm7
+; SSSE3-NEXT: pshufb %xmm2, %xmm7
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pshufb %xmm0, %xmm3
+; SSSE3-NEXT: por %xmm7, %xmm3
+; SSSE3-NEXT: pshufb %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pshufb %xmm0, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pshufb %xmm1, %xmm2
+; SSSE3-NEXT: por %xmm6, %xmm2
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test_bitreverse_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_bitreverse_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: retq
+;
+; XOPAVX1-LABEL: test_bitreverse_v16i16:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v16i16:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: retq
+ %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
+ ret <16 x i16> %b
+}
+
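+; NOTE (reviewer sketch, not part of the generated checks): two lowering strategies are
+; exercised in these functions. SSE2 byte-swaps each element, then rebuilds each byte one
+; bit at a time from eight shift+mask terms (psllw/psrlw plus pand against 0x80,0x40,...,0x01)
+; that are OR'd together. SSSE3 and later instead reverse each byte with two 16-entry pshufb
+; lookups: roughly rev8(b) = T_hi[b & 0xF] | T_lo[b >> 4], where T_hi[n] is the 4-bit reverse
+; of n shifted into the high nibble ([0,128,64,192,...]) and T_lo[n] is the 4-bit reverse of n
+; in the low nibble ([0,8,4,12,...]); those are exactly the vector constants that appear in
+; the SSSE3/AVX check lines.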
+define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v8i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psllw $7, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm11, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllw $3, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm4
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
+; SSE2-NEXT: pand %xmm13, %xmm4
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psrlw $5, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm4, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psllw $7, %xmm3
+; SSE2-NEXT: pand %xmm11, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psllw $3, %xmm7
+; SSE2-NEXT: pand %xmm12, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psrlw $1, %xmm7
+; SSE2-NEXT: pand %xmm13, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm2, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v8i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SSSE3-NEXT: pshufb %xmm4, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm6, %xmm7
+; SSSE3-NEXT: pshufb %xmm2, %xmm7
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pshufb %xmm0, %xmm3
+; SSSE3-NEXT: por %xmm7, %xmm3
+; SSSE3-NEXT: pshufb %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pshufb %xmm0, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pshufb %xmm1, %xmm2
+; SSSE3-NEXT: por %xmm6, %xmm2
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test_bitreverse_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_bitreverse_v8i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: retq
+;
+; XOPAVX1-LABEL: test_bitreverse_v8i32:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v8i32:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: retq
+ %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
+ ret <8 x i32> %b
+}
+
+define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v4i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psllw $5, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm11, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psllw $3, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: paddb %xmm2, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
+; SSE2-NEXT: pand %xmm13, %xmm3
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: por %xmm3, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psrlw $5, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm3, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psllw $3, %xmm7
+; SSE2-NEXT: pand %xmm12, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psrlw $1, %xmm7
+; SSE2-NEXT: pand %xmm13, %xmm7
+; SSE2-NEXT: por %xmm5, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm2, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v4i64:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; SSSE3-NEXT: pshufb %xmm4, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm6, %xmm7
+; SSSE3-NEXT: pshufb %xmm2, %xmm7
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pshufb %xmm0, %xmm3
+; SSSE3-NEXT: por %xmm7, %xmm3
+; SSSE3-NEXT: pshufb %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pshufb %xmm0, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pshufb %xmm1, %xmm2
+; SSSE3-NEXT: por %xmm6, %xmm2
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test_bitreverse_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_bitreverse_v4i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: retq
+;
+; XOPAVX1-LABEL: test_bitreverse_v4i64:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v4i64:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; XOPAVX2-NEXT: retq
+ %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+ ret <4 x i64> %b
+}
+
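+; NOTE (reviewer sketch, not part of the generated checks): the 512-bit tests below show how
+; each target legalizes the wide vector. SSE2/SSSE3 process four 128-bit registers and hoist
+; the constant masks so they are materialized only once, AVX1/AVX2/AVX512F split the work
+; into two 256-bit halves, and only AVX512BW keeps a single 512-bit value, using vpandq,
+; vpshufb and vporq on zmm registers.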
+define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v64i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllw $5, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm9
+; SSE2-NEXT: pand %xmm9, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psllw $7, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm10, %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psllw $3, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: paddb %xmm4, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psrlw $3, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrlw $5, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm14
+; SSE2-NEXT: pand %xmm14, %xmm4
+; SSE2-NEXT: por %xmm6, %xmm4
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm6, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: por %xmm7, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psllw $5, %xmm4
+; SSE2-NEXT: pand %xmm9, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psllw $7, %xmm7
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psllw $3, %xmm5
+; SSE2-NEXT: pand %xmm11, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: paddb %xmm4, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: pand %xmm12, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrlw $3, %xmm4
+; SSE2-NEXT: pand %xmm13, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $5, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: por %xmm7, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psllw $5, %xmm4
+; SSE2-NEXT: pand %xmm9, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm7
+; SSE2-NEXT: psllw $7, %xmm7
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psllw $3, %xmm5
+; SSE2-NEXT: pand %xmm11, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: paddb %xmm4, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: pand %xmm12, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrlw $3, %xmm4
+; SSE2-NEXT: pand %xmm13, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $5, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: psrlw $7, %xmm2
+; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: por %xmm5, %xmm2
+; SSE2-NEXT: por %xmm7, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psllw $5, %xmm4
+; SSE2-NEXT: pand %xmm9, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: psllw $7, %xmm7
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psllw $3, %xmm5
+; SSE2-NEXT: pand %xmm11, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: paddb %xmm4, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: pand %xmm12, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrlw $3, %xmm4
+; SSE2-NEXT: pand %xmm13, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psrlw $5, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: psrlw $7, %xmm3
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: por %xmm5, %xmm3
+; SSE2-NEXT: por %xmm7, %xmm3
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v64i8:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movdqa %xmm0, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: pand %xmm8, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm9, %xmm6
+; SSSE3-NEXT: pshufb %xmm0, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm5
+; SSSE3-NEXT: pand %xmm8, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: pshufb %xmm5, %xmm0
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: pand %xmm8, %xmm5
+; SSSE3-NEXT: movdqa %xmm9, %xmm6
+; SSSE3-NEXT: pshufb %xmm5, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm8, %xmm1
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pshufb %xmm1, %xmm5
+; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm8, %xmm1
+; SSSE3-NEXT: movdqa %xmm9, %xmm7
+; SSSE3-NEXT: pshufb %xmm1, %xmm7
+; SSSE3-NEXT: psrlw $4, %xmm2
+; SSSE3-NEXT: pand %xmm8, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm6
+; SSSE3-NEXT: pshufb %xmm2, %xmm6
+; SSSE3-NEXT: por %xmm7, %xmm6
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: pand %xmm8, %xmm1
+; SSSE3-NEXT: pshufb %xmm1, %xmm9
+; SSSE3-NEXT: psrlw $4, %xmm3
+; SSSE3-NEXT: pand %xmm8, %xmm3
+; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: por %xmm9, %xmm4
+; SSSE3-NEXT: movdqa %xmm5, %xmm1
+; SSSE3-NEXT: movdqa %xmm6, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm3
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test_bitreverse_v64i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm4
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v64i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_bitreverse_v64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT: vpshufb %ymm0, %ymm5, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpshufb %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_bitreverse_v64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; XOPAVX1-LABEL: test_bitreverse_v64i8:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v64i8:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX2-NEXT: retq
+ %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
+ ret <64 x i8> %b
+}
+
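+; NOTE (reviewer sketch, not part of the generated checks): for the i16/i32/i64 element
+; widths only the initial byte swap differs (the pshuflw/pshufhw patterns or the leading
+; pshufb byte order); the per-byte bit reversal that follows is identical to the v64i8 case.
+; The XOP variants fold both steps into a single vpperm per 128-bit lane, whose selector
+; bytes appear to both pick the swapped byte and request XOP's bit-reversed copy of it
+; (hence the 80..95 range in the selector constants).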
+define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v32i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: packuswb %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm11, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psrlw $3, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm14
+; SSE2-NEXT: pand %xmm14, %xmm7
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlw $5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm15
+; SSE2-NEXT: pand %xmm15, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm7, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: packuswb %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm15, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: packuswb %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm15, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm2
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: packuswb %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm15, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm3
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: por %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v32i16:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; SSSE3-NEXT: pshufb %xmm8, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm9, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pshufb %xmm0, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm9, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: pshufb %xmm8, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm1
+; SSSE3-NEXT: pand %xmm9, %xmm1
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pshufb %xmm1, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm5
+; SSSE3-NEXT: pand %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: pshufb %xmm5, %xmm1
+; SSSE3-NEXT: por %xmm6, %xmm1
+; SSSE3-NEXT: pshufb %xmm8, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pand %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pshufb %xmm5, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm2
+; SSSE3-NEXT: pand %xmm9, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pshufb %xmm2, %xmm5
+; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: pshufb %xmm8, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: pand %xmm9, %xmm2
+; SSSE3-NEXT: pshufb %xmm2, %xmm7
+; SSSE3-NEXT: psrlw $4, %xmm3
+; SSSE3-NEXT: pand %xmm9, %xmm3
+; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: por %xmm7, %xmm4
+; SSSE3-NEXT: movdqa %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm3
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test_bitreverse_v32i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v32i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
+; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_bitreverse_v32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512F-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm2
+; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_bitreverse_v32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; XOPAVX1-LABEL: test_bitreverse_v32i16:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v32i16:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX2-NEXT: retq
+ %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
+ ret <32 x i16> %b
+}
+
+define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v16i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm11, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psrlw $3, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm14
+; SSE2-NEXT: pand %xmm14, %xmm7
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlw $5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm15
+; SSE2-NEXT: pand %xmm15, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm7, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm15, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm15, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm2
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm15, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm3
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: por %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v16i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SSSE3-NEXT: pshufb %xmm8, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm9, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pshufb %xmm0, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm9, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: pshufb %xmm8, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm1
+; SSSE3-NEXT: pand %xmm9, %xmm1
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pshufb %xmm1, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm5
+; SSSE3-NEXT: pand %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: pshufb %xmm5, %xmm1
+; SSSE3-NEXT: por %xmm6, %xmm1
+; SSSE3-NEXT: pshufb %xmm8, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pand %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pshufb %xmm5, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm2
+; SSSE3-NEXT: pand %xmm9, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pshufb %xmm2, %xmm5
+; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: pshufb %xmm8, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: pand %xmm9, %xmm2
+; SSSE3-NEXT: pshufb %xmm2, %xmm7
+; SSSE3-NEXT: psrlw $4, %xmm3
+; SSSE3-NEXT: pand %xmm9, %xmm3
+; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: por %xmm7, %xmm4
+; SSSE3-NEXT: movdqa %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm3
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test_bitreverse_v16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
+; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_bitreverse_v16i32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpslld $29, %zmm0, %zmm1
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm2
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm1
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512F-NEXT: vpslld $27, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $25, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $23, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $21, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $19, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $17, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $15, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $13, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $11, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $9, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $7, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $5, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $3, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpslld $1, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $3, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $5, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $7, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $9, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $11, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $13, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $15, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $17, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $19, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $21, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $23, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $25, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $27, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $29, %zmm0, %zmm3
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512F-NEXT: vpord %zmm0, %zmm2, %zmm0
+; AVX512F-NEXT: vpord %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_bitreverse_v16i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; XOPAVX1-LABEL: test_bitreverse_v16i32:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v16i32:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX2-NEXT: retq
+ %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
+ ret <16 x i32> %b
+}
+
+define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
+; SSE2-LABEL: test_bitreverse_v8i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm10
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; SSE2-NEXT: pand %xmm11, %xmm11
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm13
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psrlw $3, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm14
+; SSE2-NEXT: pand %xmm14, %xmm7
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlw $5, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm15
+; SSE2-NEXT: pand %xmm15, %xmm5
+; SSE2-NEXT: por %xmm7, %xmm5
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-NEXT: pand %xmm7, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: por %xmm5, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm15, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm15, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm2
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: por %xmm6, %xmm2
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psllw $5, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psllw $7, %xmm4
+; SSE2-NEXT: pand %xmm11, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psllw $3, %xmm6
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: paddb %xmm5, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: pand %xmm13, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psrlw $3, %xmm5
+; SSE2-NEXT: pand %xmm14, %xmm5
+; SSE2-NEXT: por %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psrlw $5, %xmm6
+; SSE2-NEXT: pand %xmm15, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: psrlw $7, %xmm3
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: por %xmm6, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_bitreverse_v8i64:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; SSSE3-NEXT: pshufb %xmm8, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pand %xmm9, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pshufb %xmm0, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm9, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: pshufb %xmm8, %xmm5
+; SSSE3-NEXT: movdqa %xmm5, %xmm1
+; SSSE3-NEXT: pand %xmm9, %xmm1
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pshufb %xmm1, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm5
+; SSSE3-NEXT: pand %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: pshufb %xmm5, %xmm1
+; SSSE3-NEXT: por %xmm6, %xmm1
+; SSSE3-NEXT: pshufb %xmm8, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pand %xmm9, %xmm5
+; SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSSE3-NEXT: pshufb %xmm5, %xmm6
+; SSSE3-NEXT: psrlw $4, %xmm2
+; SSSE3-NEXT: pand %xmm9, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pshufb %xmm2, %xmm5
+; SSSE3-NEXT: por %xmm6, %xmm5
+; SSSE3-NEXT: pshufb %xmm8, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: pand %xmm9, %xmm2
+; SSSE3-NEXT: pshufb %xmm2, %xmm7
+; SSSE3-NEXT: psrlw $4, %xmm3
+; SSSE3-NEXT: pand %xmm9, %xmm3
+; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: por %xmm7, %xmm4
+; SSSE3-NEXT: movdqa %xmm5, %xmm2
+; SSSE3-NEXT: movdqa %xmm4, %xmm3
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test_bitreverse_v8i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_bitreverse_v8i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
+; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_bitreverse_v8i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpsllq $61, %zmm0, %zmm1
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm2
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm1
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512F-NEXT: vpsllq $59, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $57, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $55, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $53, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $51, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $49, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $47, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $45, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $43, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $41, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $39, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $37, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $35, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $33, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $31, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $29, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $27, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $25, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $23, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $21, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $19, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $17, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $15, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $13, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $11, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $9, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $7, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $5, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $3, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq $1, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $1, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $3, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $5, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $7, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $9, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $11, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $13, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $15, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $17, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $19, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $21, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $23, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $25, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $27, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $29, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $31, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $33, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $35, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $37, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $39, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $41, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $43, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $45, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $47, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $49, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $51, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $53, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $55, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $57, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $59, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $61, %zmm0, %zmm3
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_bitreverse_v8i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; XOPAVX1-LABEL: test_bitreverse_v8i64:
+; XOPAVX1: # BB#0:
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: test_bitreverse_v8i64:
+; XOPAVX2: # BB#0:
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
+; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
+; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; XOPAVX2-NEXT: retq
+ %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
+ ret <8 x i64> %b
+}
+
+declare i8 @llvm.bitreverse.i8(i8) readnone
+declare i16 @llvm.bitreverse.i16(i16) readnone
+declare i32 @llvm.bitreverse.i32(i32) readnone
+declare i64 @llvm.bitreverse.i64(i64) readnone
+
+declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
+declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
+declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
+declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone
+
+declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) readnone
+declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
+declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) readnone
+declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) readnone
+
+declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>) readnone
+declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
+declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone
+declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>) readnone
diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll
index aaf81f2f9bb6..309fa98145c6 100644
--- a/test/CodeGen/X86/vector-blend.ll
+++ b/test/CodeGen/X86/vector-blend.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
@@ -272,15 +273,15 @@ define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
; SSE41-LABEL: vsel_i8:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
-; SSE41-NEXT: pblendvb %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: vsel_i8:
; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
-; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
entry:
%vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2
@@ -632,8 +633,8 @@ define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
; SSE2-NEXT: andps %xmm4, %xmm3
; SSE2-NEXT: andnps %xmm1, %xmm4
; SSE2-NEXT: orps %xmm3, %xmm4
-; SSE2-NEXT: movaps %xmm5, %xmm0
-; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: constant_pblendvb_avx2:
@@ -651,20 +652,19 @@ define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
; SSE41-LABEL: constant_pblendvb_avx2:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
-; SSE41-NEXT: pblendvb %xmm2, %xmm4
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; SSE41-NEXT: pblendvb %xmm4, %xmm2
+; SSE41-NEXT: pblendvb %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_pblendvb_avx2:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vmovdqa .LCPI18_0(%rip), %xmm4 # xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_pblendvb_avx2:
@@ -801,3 +801,254 @@ entry:
%select = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
ret <4 x i64> %select
}
+
+define <4 x i32> @blend_logic_v4i32(<4 x i32> %b, <4 x i32> %a, <4 x i32> %c) {
+; SSE2-LABEL: blend_logic_v4i32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: blend_logic_v4i32:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: pand %xmm0, %xmm1
+; SSSE3-NEXT: pandn %xmm2, %xmm0
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: blend_logic_v4i32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pblendvb %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: blend_logic_v4i32:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
+; AVX-NEXT: retq
+entry:
+ %b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
+ %sub = sub nsw <4 x i32> zeroinitializer, %a
+ %0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %1 = and <4 x i32> %c, %0
+ %2 = and <4 x i32> %a, %b.lobit
+ %cond = or <4 x i32> %1, %2
+ ret <4 x i32> %cond
+}
+
+define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) {
+; SSE2-LABEL: blend_logic_v8i32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: pandn %xmm5, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pandn %xmm4, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: blend_logic_v8i32:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: pand %xmm1, %xmm3
+; SSSE3-NEXT: pandn %xmm5, %xmm1
+; SSSE3-NEXT: pand %xmm0, %xmm2
+; SSSE3-NEXT: pandn %xmm4, %xmm0
+; SSSE3-NEXT: por %xmm2, %xmm0
+; SSSE3-NEXT: por %xmm3, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: blend_logic_v8i32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: psrad $31, %xmm0
+; SSE41-NEXT: pblendvb %xmm2, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm3, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm5, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: blend_logic_v8i32:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: vandnps %ymm2, %ymm0, %ymm2
+; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: blend_logic_v8i32:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %b.lobit = ashr <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+ %sub = sub nsw <8 x i32> zeroinitializer, %a
+ %0 = xor <8 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %1 = and <8 x i32> %c, %0
+ %2 = and <8 x i32> %a, %b.lobit
+ %cond = or <8 x i32> %1, %2
+ ret <8 x i32> %cond
+}
+
+define <4 x i32> @blend_neg_logic_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: blend_neg_logic_v4i32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: blend_neg_logic_v4i32:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: psubd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: blend_neg_logic_v4i32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: blend_neg_logic_v4i32:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+entry:
+ %b.lobit = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
+ %sub = sub nsw <4 x i32> zeroinitializer, %a
+ %0 = xor <4 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %1 = and <4 x i32> %a, %0
+ %2 = and <4 x i32> %b.lobit, %sub
+ %cond = or <4 x i32> %1, %2
+ ret <4 x i32> %cond
+}
+
+define <8 x i32> @blend_neg_logic_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; SSE2-LABEL: blend_neg_logic_v8i32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: psubd %xmm3, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: blend_neg_logic_v8i32:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: psrad $31, %xmm3
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: psubd %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm3, %xmm1
+; SSSE3-NEXT: psubd %xmm3, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: blend_neg_logic_v8i32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: psrad $31, %xmm3
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: psubd %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm3, %xmm1
+; SSE41-NEXT: psubd %xmm3, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: blend_neg_logic_v8i32:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: blend_neg_logic_v8i32:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+entry:
+ %b.lobit = ashr <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+ %sub = sub nsw <8 x i32> zeroinitializer, %a
+ %0 = xor <8 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %1 = and <8 x i32> %a, %0
+ %2 = and <8 x i32> %b.lobit, %sub
+ %cond = or <8 x i32> %1, %2
+ ret <8 x i32> %cond
+}
+
+define <4 x i32> @blend_neg_logic_v4i32_2(<4 x i32> %v, <4 x i32> %c) {
+; SSE2-LABEL: blend_neg_logic_v4i32_2:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: psrld $31, %xmm1
+; SSE2-NEXT: pslld $31, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: blend_neg_logic_v4i32_2:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: psrld $31, %xmm1
+; SSSE3-NEXT: pslld $31, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: psubd %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: blend_neg_logic_v4i32_2:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrld $31, %xmm1
+; SSE41-NEXT: pslld $31, %xmm1
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: psubd %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvps %xmm2, %xmm3
+; SSE41-NEXT: movaps %xmm3, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: blend_neg_logic_v4i32_2:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpsrld $31, %xmm1, %xmm1
+; AVX-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm2
+; AVX-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; AVX-NEXT: retq
+entry:
+ %0 = ashr <4 x i32> %c, <i32 31, i32 31, i32 31, i32 31>
+ %1 = trunc <4 x i32> %0 to <4 x i1>
+ %2 = sub nsw <4 x i32> zeroinitializer, %v
+ %3 = select <4 x i1> %1, <4 x i32> %v, <4 x i32> %2
+ ret <4 x i32> %3
+}
diff --git a/test/CodeGen/X86/vector-compare-combines.ll b/test/CodeGen/X86/vector-compare-combines.ll
new file mode 100644
index 000000000000..c25474d92f9c
--- /dev/null
+++ b/test/CodeGen/X86/vector-compare-combines.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+
+; If we have SSE/AVX intrinsics in the code, we miss obvious combines
+; unless we do them late on X86-specific nodes.
+
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @PR27924_cmpeq(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: PR27924_cmpeq:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: PR27924_cmpeq:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %cmp = icmp sgt <4 x i32> %a, %b
+ %max = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
+ %sse_max = tail call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a, <4 x i32> %b)
+ %truth = icmp eq <4 x i32> %max, %sse_max
+ %ret = sext <4 x i1> %truth to <4 x i32>
+ ret <4 x i32> %ret
+}
+
+define <4 x i32> @PR27924_cmpgt(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: PR27924_cmpgt:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: PR27924_cmpgt:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %cmp = icmp sgt <4 x i32> %a, %b
+ %max = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
+ %sse_max = tail call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a, <4 x i32> %b)
+ %untruth = icmp sgt <4 x i32> %max, %sse_max
+ %ret = sext <4 x i1> %untruth to <4 x i32>
+ ret <4 x i32> %ret
+}
+
diff --git a/test/CodeGen/X86/vector-compare-results.ll b/test/CodeGen/X86/vector-compare-results.ll
new file mode 100644
index 000000000000..595d3a42b76f
--- /dev/null
+++ b/test/CodeGen/X86/vector-compare-results.ll
@@ -0,0 +1,6625 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+
+;
+; 128-bit vector comparisons
+;
+
+define <2 x i1> @test_cmp_v2f64(<2 x double> %a0, <2 x double> %a1) nounwind {
+; SSE-LABEL: test_cmp_v2f64:
+; SSE: # BB#0:
+; SSE-NEXT: cmpltpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_cmp_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fcmp ogt <2 x double> %a0, %a1
+ ret <2 x i1> %1
+}
+
+define <4 x i1> @test_cmp_v4f32(<4 x float> %a0, <4 x float> %a1) nounwind {
+; SSE-LABEL: test_cmp_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: cmpltps %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_cmp_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = fcmp ogt <4 x float> %a0, %a1
+ ret <4 x i1> %1
+}
+
+define <2 x i1> @test_cmp_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v2i64:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: test_cmp_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = icmp sgt <2 x i64> %a0, %a1
+ ret <2 x i1> %1
+}
+
+define <4 x i1> @test_cmp_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
+; SSE-LABEL: test_cmp_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_cmp_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = icmp sgt <4 x i32> %a0, %a1
+ ret <4 x i1> %1
+}
+
+define <8 x i1> @test_cmp_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
+; SSE-LABEL: test_cmp_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_cmp_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = icmp sgt <8 x i16> %a0, %a1
+ ret <8 x i1> %1
+}
+
+define <16 x i1> @test_cmp_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
+; SSE-LABEL: test_cmp_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_cmp_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = icmp sgt <16 x i8> %a0, %a1
+ ret <16 x i1> %1
+}
+
+;
+; 256-bit vector comparisons
+;
+
+define <4 x i1> @test_cmp_v4f64(<4 x double> %a0, <4 x double> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v4f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: cmpltpd %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSE2-NEXT: cmpltpd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v4f64:
+; SSE42: # BB#0:
+; SSE42-NEXT: cmpltpd %xmm1, %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,2]
+; SSE42-NEXT: cmpltpd %xmm0, %xmm2
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v4f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v4f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v4f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = fcmp ogt <4 x double> %a0, %a1
+ ret <4 x i1> %1
+}
+
+define <8 x i1> @test_cmp_v8f32(<8 x float> %a0, <8 x float> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v8f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: cmpltps %xmm1, %xmm3
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: cmpltps %xmm0, %xmm2
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: packssdw %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v8f32:
+; SSE42: # BB#0:
+; SSE42-NEXT: cmpltps %xmm1, %xmm3
+; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE42-NEXT: pshufb %xmm1, %xmm3
+; SSE42-NEXT: cmpltps %xmm0, %xmm2
+; SSE42-NEXT: pshufb %xmm1, %xmm2
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE42-NEXT: movdqa %xmm2, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v8f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = fcmp ogt <8 x float> %a0, %a1
+ ret <8 x i1> %1
+}
+
+define <4 x i1> @test_cmp_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v4i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v4i64:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtq %xmm3, %xmm1
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v4i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = icmp sgt <4 x i64> %a0, %a1
+ ret <4 x i1> %1
+}
+
+define <8 x i1> @test_cmp_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v8i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v8i32:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE42-NEXT: pshufb %xmm3, %xmm1
+; SSE42-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE42-NEXT: pshufb %xmm3, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v8i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = icmp sgt <8 x i32> %a0, %a1
+ ret <8 x i1> %1
+}
+
+define <16 x i1> @test_cmp_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v16i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pcmpgtw %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v16i16:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtw %xmm3, %xmm1
+; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE42-NEXT: pshufb %xmm3, %xmm1
+; SSE42-NEXT: pcmpgtw %xmm2, %xmm0
+; SSE42-NEXT: pshufb %xmm3, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = icmp sgt <16 x i16> %a0, %a1
+ ret <16 x i1> %1
+}
+
+define <32 x i1> @test_cmp_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v32i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtb %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v32i8:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtb %xmm2, %xmm0
+; SSE42-NEXT: pcmpgtb %xmm3, %xmm1
+; SSE42-NEXT: pextrb $15, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $14, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $13, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $12, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $11, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $10, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $9, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $8, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $7, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $6, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $5, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $4, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $3, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $2, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $1, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $0, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: movq %rdi, %rax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v32i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = icmp sgt <32 x i8> %a0, %a1
+ ret <32 x i1> %1
+}
+
+;
+; 512-bit vector comparisons
+;
+
+define <8 x i1> @test_cmp_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v8f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: cmpltpd %xmm3, %xmm7
+; SSE2-NEXT: cmpltpd %xmm1, %xmm5
+; SSE2-NEXT: pextrw $4, %xmm5, %eax
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
+; SSE2-NEXT: cmpltpd %xmm2, %xmm6
+; SSE2-NEXT: cmpltpd %xmm0, %xmm4
+; SSE2-NEXT: pextrw $4, %xmm4, %ecx
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE2-NEXT: pextrw $4, %xmm7, %edx
+; SSE2-NEXT: movd %edx, %xmm0
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: pextrw $4, %xmm6, %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v8f64:
+; SSE42: # BB#0:
+; SSE42-NEXT: cmpltpd %xmm3, %xmm7
+; SSE42-NEXT: xorpd %xmm3, %xmm3
+; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0],xmm3[1,2,3],xmm7[4],xmm3[5,6,7]
+; SSE42-NEXT: cmpltpd %xmm2, %xmm6
+; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0],xmm3[1,2,3],xmm6[4],xmm3[5,6,7]
+; SSE42-NEXT: packusdw %xmm7, %xmm6
+; SSE42-NEXT: cmpltpd %xmm1, %xmm5
+; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0],xmm3[1,2,3],xmm5[4],xmm3[5,6,7]
+; SSE42-NEXT: cmpltpd %xmm0, %xmm4
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
+; SSE42-NEXT: packusdw %xmm5, %xmm3
+; SSE42-NEXT: packusdw %xmm6, %xmm3
+; SSE42-NEXT: movdqa %xmm3, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v8f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v8f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v8f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = fcmp ogt <8 x double> %a0, %a1
+ ret <8 x i1> %1
+}
+
+define <16 x i1> @test_cmp_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
+; SSE-LABEL: test_cmp_v16f32:
+; SSE: # BB#0:
+; SSE-NEXT: cmpltps %xmm3, %xmm7
+; SSE-NEXT: movaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: andps %xmm3, %xmm7
+; SSE-NEXT: cmpltps %xmm2, %xmm6
+; SSE-NEXT: andps %xmm3, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: cmpltps %xmm1, %xmm5
+; SSE-NEXT: andps %xmm3, %xmm5
+; SSE-NEXT: cmpltps %xmm0, %xmm4
+; SSE-NEXT: andps %xmm4, %xmm3
+; SSE-NEXT: packuswb %xmm5, %xmm3
+; SSE-NEXT: packuswb %xmm6, %xmm3
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v16f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vmovaps {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vandps %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v16f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v16f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vcmpltps %zmm0, %zmm1, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = fcmp ogt <16 x float> %a0, %a1
+ ret <16 x i1> %1
+}
+
+define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v8i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
+; SSE2-NEXT: pxor %xmm8, %xmm7
+; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm9
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: pextrw $4, %xmm1, %eax
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3]
+; SSE2-NEXT: pxor %xmm8, %xmm6
+; SSE2-NEXT: pxor %xmm8, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: por %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pextrw $4, %xmm9, %ecx
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: pextrw $4, %xmm0, %ecx
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: pextrw $4, %xmm3, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: movd %ecx, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v8i64:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtq %xmm7, %xmm3
+; SSE42-NEXT: pxor %xmm7, %xmm7
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1,2,3],xmm3[4],xmm7[5,6,7]
+; SSE42-NEXT: pcmpgtq %xmm6, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1,2,3],xmm2[4],xmm7[5,6,7]
+; SSE42-NEXT: packusdw %xmm3, %xmm2
+; SSE42-NEXT: pcmpgtq %xmm5, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm7[1,2,3],xmm1[4],xmm7[5,6,7]
+; SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7]
+; SSE42-NEXT: packusdw %xmm1, %xmm0
+; SSE42-NEXT: packusdw %xmm2, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v8i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7]
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v8i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v8i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = icmp sgt <8 x i64> %a0, %a1
+ ret <8 x i1> %1
+}
+
+define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: test_cmp_v16i32:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtd %xmm7, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm7, %xmm3
+; SSE-NEXT: pcmpgtd %xmm6, %xmm2
+; SSE-NEXT: pand %xmm7, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pcmpgtd %xmm5, %xmm1
+; SSE-NEXT: pand %xmm7, %xmm1
+; SSE-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE-NEXT: pand %xmm7, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v16i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v16i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v16i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = icmp sgt <16 x i32> %a0, %a1
+ ret <16 x i1> %1
+}
+
+define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v32i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: pcmpgtw %xmm5, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pcmpgtw %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: pcmpgtw %xmm7, %xmm3
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pcmpgtw %xmm6, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: packuswb %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v32i16:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtw %xmm5, %xmm1
+; SSE42-NEXT: movdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE42-NEXT: pshufb %xmm5, %xmm1
+; SSE42-NEXT: pcmpgtw %xmm4, %xmm0
+; SSE42-NEXT: pshufb %xmm5, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE42-NEXT: pcmpgtw %xmm7, %xmm3
+; SSE42-NEXT: pshufb %xmm5, %xmm3
+; SSE42-NEXT: pcmpgtw %xmm6, %xmm2
+; SSE42-NEXT: pshufb %xmm5, %xmm2
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE42-NEXT: pextrb $15, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $14, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $13, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $12, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $11, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $10, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $9, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $8, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $7, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $6, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $5, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $4, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $3, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $2, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $1, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $0, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: movq %rdi, %rax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v32i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpcmpgtw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v32i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX2-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v32i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
+; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = icmp sgt <32 x i16> %a0, %a1
+ ret <32 x i1> %1
+}
+
+define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v64i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: pcmpgtb %xmm4, %xmm0
+; SSE2-NEXT: pcmpgtb %xmm5, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm6, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm7, %xmm3
+; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v64i8:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtb %xmm4, %xmm0
+; SSE42-NEXT: pcmpgtb %xmm5, %xmm1
+; SSE42-NEXT: pcmpgtb %xmm6, %xmm2
+; SSE42-NEXT: pcmpgtb %xmm7, %xmm3
+; SSE42-NEXT: pextrb $15, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $14, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $13, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $12, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $11, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $10, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $9, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $8, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $7, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $6, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $5, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $4, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $3, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $2, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $1, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $0, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: movq %rdi, %rax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v64i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpextrb $15, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v64i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $15, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v64i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm3
+; AVX512-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm1
+; AVX512-NEXT: vpsllw $7, %xmm1, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512-NEXT: vpcmpgtb %xmm0, %xmm5, %xmm0
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512-NEXT: vpsllw $7, %xmm1, %xmm1
+; AVX512-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpgtb %xmm1, %xmm5, %xmm1
+; AVX512-NEXT: vpsllw $7, %xmm3, %xmm2
+; AVX512-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpgtb %xmm2, %xmm5, %xmm2
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX512-NEXT: vpsllw $7, %xmm3, %xmm3
+; AVX512-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vpcmpgtb %xmm3, %xmm5, %xmm3
+; AVX512-NEXT: retq
+ %1 = icmp sgt <64 x i8> %a0, %a1
+ ret <64 x i1> %1
+}
+
+;
+; 1024-bit vector comparisons
+;
+
+define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind {
+; SSE-LABEL: test_cmp_v16f64:
+; SSE: # BB#0:
+; SSE-NEXT: movapd %xmm0, %xmm8
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15
+; SSE-NEXT: cmpltpd %xmm7, %xmm15
+; SSE-NEXT: movapd {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: andpd %xmm7, %xmm15
+; SSE-NEXT: cmpltpd %xmm6, %xmm14
+; SSE-NEXT: andpd %xmm7, %xmm14
+; SSE-NEXT: packuswb %xmm15, %xmm14
+; SSE-NEXT: cmpltpd %xmm5, %xmm13
+; SSE-NEXT: andpd %xmm7, %xmm13
+; SSE-NEXT: cmpltpd %xmm4, %xmm9
+; SSE-NEXT: andpd %xmm7, %xmm9
+; SSE-NEXT: packuswb %xmm13, %xmm9
+; SSE-NEXT: packuswb %xmm14, %xmm9
+; SSE-NEXT: cmpltpd %xmm3, %xmm12
+; SSE-NEXT: andpd %xmm7, %xmm12
+; SSE-NEXT: cmpltpd %xmm2, %xmm10
+; SSE-NEXT: andpd %xmm7, %xmm10
+; SSE-NEXT: packuswb %xmm12, %xmm10
+; SSE-NEXT: cmpltpd %xmm1, %xmm11
+; SSE-NEXT: andpd %xmm7, %xmm11
+; SSE-NEXT: cmpltpd %xmm8, %xmm0
+; SSE-NEXT: andpd %xmm7, %xmm0
+; SSE-NEXT: packuswb %xmm11, %xmm0
+; SSE-NEXT: packuswb %xmm10, %xmm0
+; SSE-NEXT: packuswb %xmm9, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v16f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltpd %ymm3, %ymm7, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
+; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vandpd %xmm7, %xmm8, %xmm8
+; AVX1-NEXT: vandpd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vcmpltpd %ymm2, %ymm6, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vandpd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vandpd %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vcmpltpd %ymm1, %ymm5, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vandpd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vandpd %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vcmpltpd %ymm0, %ymm4, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vandpd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vandpd %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v16f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vcmpltpd %ymm2, %ymm6, %ymm2
+; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vcmpltpd %ymm3, %ymm7, %ymm3
+; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-NEXT: vcmpltpd %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vcmpltpd %ymm1, %ymm5, %ymm1
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v16f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm4
+; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm5
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: vucomisd %xmm4, %xmm5
+; AVX512-NEXT: movq $-1, %rcx
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512-NEXT: vucomisd %xmm4, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm4
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
+; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm5
+; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm6
+; AVX512-NEXT: vucomisd %xmm5, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm7
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
+; AVX512-NEXT: vucomisd %xmm5, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm5
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512-NEXT: vextractf32x4 $1, %zmm2, %xmm5
+; AVX512-NEXT: vextractf32x4 $1, %zmm0, %xmm6
+; AVX512-NEXT: vucomisd %xmm5, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm7
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
+; AVX512-NEXT: vucomisd %xmm5, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm5
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
+; AVX512-NEXT: vucomisd %xmm2, %xmm0
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vucomisd %xmm2, %xmm0
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0]
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vextractf32x4 $3, %zmm3, %xmm2
+; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm4
+; AVX512-NEXT: vucomisd %xmm2, %xmm4
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm5
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512-NEXT: vucomisd %xmm2, %xmm4
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm2
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
+; AVX512-NEXT: vextractf32x4 $2, %zmm3, %xmm4
+; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm5
+; AVX512-NEXT: vucomisd %xmm4, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512-NEXT: vucomisd %xmm4, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm4
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512-NEXT: vextractf32x4 $1, %zmm3, %xmm4
+; AVX512-NEXT: vextractf32x4 $1, %zmm1, %xmm5
+; AVX512-NEXT: vucomisd %xmm4, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512-NEXT: vucomisd %xmm4, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm4
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
+; AVX512-NEXT: vucomisd %xmm3, %xmm1
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovaq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm5
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT: vucomisd %xmm3, %xmm1
+; AVX512-NEXT: cmovaq %rcx, %rax
+; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = fcmp ogt <16 x double> %a0, %a1
+ ret <16 x i1> %1
+}
+
+define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v32f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
+; SSE2-NEXT: cmpltps %xmm3, %xmm15
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: cmpltps %xmm2, %xmm14
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT: psllw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: cmpltps %xmm1, %xmm13
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: cmpltps %xmm0, %xmm12
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: psllw $15, %xmm0
+; SSE2-NEXT: psraw $15, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: cmpltps %xmm7, %xmm11
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: cmpltps %xmm6, %xmm10
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT: psllw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: cmpltps %xmm5, %xmm9
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: cmpltps %xmm4, %xmm8
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSE2-NEXT: psllw $15, %xmm4
+; SSE2-NEXT: psraw $15, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: packuswb %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v32f32:
+; SSE42: # BB#0:
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
+; SSE42-NEXT: cmpltps %xmm3, %xmm15
+; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE42-NEXT: pshufb %xmm3, %xmm15
+; SSE42-NEXT: cmpltps %xmm2, %xmm13
+; SSE42-NEXT: pshufb %xmm3, %xmm13
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm13 = xmm13[0],xmm15[0]
+; SSE42-NEXT: psllw $15, %xmm13
+; SSE42-NEXT: psraw $15, %xmm13
+; SSE42-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE42-NEXT: pshufb %xmm2, %xmm13
+; SSE42-NEXT: cmpltps %xmm1, %xmm14
+; SSE42-NEXT: pshufb %xmm3, %xmm14
+; SSE42-NEXT: cmpltps %xmm0, %xmm8
+; SSE42-NEXT: pshufb %xmm3, %xmm8
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm14[0]
+; SSE42-NEXT: psllw $15, %xmm8
+; SSE42-NEXT: psraw $15, %xmm8
+; SSE42-NEXT: pshufb %xmm2, %xmm8
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm13[0]
+; SSE42-NEXT: cmpltps %xmm7, %xmm12
+; SSE42-NEXT: pshufb %xmm3, %xmm12
+; SSE42-NEXT: cmpltps %xmm6, %xmm10
+; SSE42-NEXT: pshufb %xmm3, %xmm10
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm12[0]
+; SSE42-NEXT: psllw $15, %xmm10
+; SSE42-NEXT: psraw $15, %xmm10
+; SSE42-NEXT: pshufb %xmm2, %xmm10
+; SSE42-NEXT: cmpltps %xmm5, %xmm11
+; SSE42-NEXT: pshufb %xmm3, %xmm11
+; SSE42-NEXT: cmpltps %xmm4, %xmm9
+; SSE42-NEXT: pshufb %xmm3, %xmm9
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0]
+; SSE42-NEXT: psllw $15, %xmm9
+; SSE42-NEXT: psraw $15, %xmm9
+; SSE42-NEXT: pshufb %xmm2, %xmm9
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0]
+; SSE42-NEXT: pextrb $15, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm9, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $14, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $13, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $12, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $11, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $10, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $9, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $8, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $7, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $6, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $5, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $4, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $3, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $2, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $1, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $0, %xmm8, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: movq %rdi, %rax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v32f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcmpltps %ymm3, %ymm7, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
+; AVX1-NEXT: vmovaps {{.*#+}} xmm7 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm7, %xmm8, %xmm8
+; AVX1-NEXT: vandps %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vcmpltps %ymm2, %ymm6, %ymm2
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vandps %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vandps %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vcmpltps %ymm1, %ymm5, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vandps %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vcmpltps %ymm0, %ymm4, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vandps %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v32f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vcmpltps %ymm3, %ymm7, %ymm3
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm8, %ymm3, %ymm3
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-NEXT: vcmpltps %ymm2, %ymm6, %ymm2
+; AVX2-NEXT: vpshufb %ymm8, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX2-NEXT: vcmpltps %ymm1, %ymm5, %ymm1
+; AVX2-NEXT: vpshufb %ymm8, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm1
+; AVX2-NEXT: vcmpltps %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v32f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm4
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm6
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: vucomiss %xmm5, %xmm7
+; AVX512-NEXT: movl $-1, %ecx
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vucomiss %xmm4, %xmm6
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmoval %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm5
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm8
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0]
+; AVX512-NEXT: vucomiss %xmm7, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm8, %xmm5
+; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; AVX512-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,1,2,3]
+; AVX512-NEXT: vucomiss %xmm4, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm5, %xmm8
+; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm5
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm7
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; AVX512-NEXT: vucomiss %xmm6, %xmm4
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vucomiss %xmm5, %xmm7
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmoval %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm4
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
+; AVX512-NEXT: vucomiss %xmm6, %xmm4
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
+; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
+; AVX512-NEXT: vucomiss %xmm5, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
+; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm8
+; AVX512-NEXT: vextractf32x4 $1, %zmm2, %xmm5
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512-NEXT: vextractf32x4 $1, %zmm0, %xmm7
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; AVX512-NEXT: vucomiss %xmm6, %xmm4
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vucomiss %xmm5, %xmm7
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmoval %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm4
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
+; AVX512-NEXT: vucomiss %xmm6, %xmm4
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
+; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
+; AVX512-NEXT: vucomiss %xmm5, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; AVX512-NEXT: vucomiss %xmm5, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vucomiss %xmm2, %xmm0
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmoval %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm5
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
+; AVX512-NEXT: vucomiss %xmm6, %xmm7
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512-NEXT: vucomiss %xmm2, %xmm0
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm8
+; AVX512-NEXT: vextractf32x4 $3, %zmm3, %xmm2
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm5
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512-NEXT: vucomiss %xmm4, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vucomiss %xmm2, %xmm5
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmoval %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm4
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
+; AVX512-NEXT: vucomiss %xmm6, %xmm7
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512-NEXT: vucomiss %xmm2, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
+; AVX512-NEXT: vextractf32x4 $2, %zmm3, %xmm4
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm6
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX512-NEXT: vucomiss %xmm5, %xmm7
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vucomiss %xmm4, %xmm6
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmoval %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm5
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm6[1,0]
+; AVX512-NEXT: vucomiss %xmm7, %xmm0
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm0
+; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3]
+; AVX512-NEXT: vucomiss %xmm4, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vextractf32x4 $1, %zmm3, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512-NEXT: vextractf32x4 $1, %zmm1, %xmm5
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512-NEXT: vucomiss %xmm4, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vucomiss %xmm0, %xmm5
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmoval %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm4
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
+; AVX512-NEXT: vucomiss %xmm6, %xmm7
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512-NEXT: vucomiss %xmm0, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; AVX512-NEXT: vucomiss %xmm4, %xmm5
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vucomiss %xmm3, %xmm1
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmoval %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm4
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0]
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
+; AVX512-NEXT: vucomiss %xmm5, %xmm6
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmoval %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512-NEXT: vucomiss %xmm3, %xmm1
+; AVX512-NEXT: cmoval %ecx, %eax
+; AVX512-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0
+; AVX512-NEXT: retq
+ %1 = fcmp ogt <32 x float> %a0, %a1
+ ret <32 x i1> %1
+}
+
+define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v16i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
+; SSE2-NEXT: pxor %xmm8, %xmm7
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm9
+; SSE2-NEXT: movdqa %xmm7, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm9[1,1,3,3]
+; SSE2-NEXT: pand %xmm11, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE2-NEXT: pand %xmm10, %xmm9
+; SSE2-NEXT: pxor %xmm8, %xmm6
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: pxor %xmm8, %xmm7
+; SSE2-NEXT: movdqa %xmm6, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT: pand %xmm12, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm11
+; SSE2-NEXT: pand %xmm10, %xmm11
+; SSE2-NEXT: packuswb %xmm9, %xmm11
+; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: pxor %xmm8, %xmm7
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: pand %xmm9, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm10, %xmm6
+; SSE2-NEXT: pxor %xmm8, %xmm4
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: pand %xmm9, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm10, %xmm4
+; SSE2-NEXT: packuswb %xmm6, %xmm4
+; SSE2-NEXT: packuswb %xmm11, %xmm4
+; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm5
+; SSE2-NEXT: pand %xmm10, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm2
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm10, %xmm2
+; SSE2-NEXT: packuswb %xmm5, %xmm2
+; SSE2-NEXT: pxor %xmm8, %xmm1
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: pxor %xmm8, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm10, %xmm3
+; SSE2-NEXT: pxor %xmm8, %xmm0
+; SSE2-NEXT: pxor {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: pand %xmm10, %xmm0
+; SSE2-NEXT: packuswb %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v16i64:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm7
+; SSE42-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE42-NEXT: pand %xmm8, %xmm7
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm6
+; SSE42-NEXT: pand %xmm8, %xmm6
+; SSE42-NEXT: packuswb %xmm7, %xmm6
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm5
+; SSE42-NEXT: pand %xmm8, %xmm5
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT: pand %xmm8, %xmm4
+; SSE42-NEXT: packuswb %xmm5, %xmm4
+; SSE42-NEXT: packuswb %xmm6, %xmm4
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: pand %xmm8, %xmm3
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm2
+; SSE42-NEXT: pand %xmm8, %xmm2
+; SSE42-NEXT: packuswb %xmm3, %xmm2
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm1
+; SSE42-NEXT: pand %xmm8, %xmm1
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm0
+; SSE42-NEXT: pand %xmm8, %xmm0
+; SSE42-NEXT: packuswb %xmm1, %xmm0
+; SSE42-NEXT: packuswb %xmm2, %xmm0
+; SSE42-NEXT: packuswb %xmm4, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v16i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9
+; AVX1-NEXT: vpcmpgtq %xmm8, %xmm9, %xmm9
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpand %xmm8, %xmm9, %xmm9
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm9, %xmm3, %xmm9
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm8, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm9, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v16i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtq %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpcmpgtq %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpcmpgtq %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v16i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextracti32x4 $3, %zmm2, %xmm4
+; AVX512-NEXT: vpextrq $1, %xmm4, %rcx
+; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm5
+; AVX512-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: cmpq %rcx, %rdx
+; AVX512-NEXT: movq $-1, %rcx
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vmovq %xmm4, %rdx
+; AVX512-NEXT: vmovq %xmm5, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm4
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512-NEXT: vextracti32x4 $2, %zmm2, %xmm5
+; AVX512-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm6
+; AVX512-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm7
+; AVX512-NEXT: vmovq %xmm5, %rdx
+; AVX512-NEXT: vmovq %xmm6, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm5
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512-NEXT: vextracti32x4 $1, %zmm2, %xmm5
+; AVX512-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; AVX512-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm7
+; AVX512-NEXT: vmovq %xmm5, %rdx
+; AVX512-NEXT: vmovq %xmm6, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm5
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vmovq %xmm2, %rdx
+; AVX512-NEXT: vmovq %xmm0, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vextracti32x4 $3, %zmm3, %xmm2
+; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; AVX512-NEXT: vpextrq $1, %xmm4, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm5
+; AVX512-NEXT: vmovq %xmm2, %rdx
+; AVX512-NEXT: vmovq %xmm4, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm2
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; AVX512-NEXT: vextracti32x4 $2, %zmm3, %xmm4
+; AVX512-NEXT: vpextrq $1, %xmm4, %rdx
+; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; AVX512-NEXT: vpextrq $1, %xmm5, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vmovq %xmm4, %rdx
+; AVX512-NEXT: vmovq %xmm5, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm4
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512-NEXT: vextracti32x4 $1, %zmm3, %xmm4
+; AVX512-NEXT: vpextrq $1, %xmm4, %rdx
+; AVX512-NEXT: vextracti32x4 $1, %zmm1, %xmm5
+; AVX512-NEXT: vpextrq $1, %xmm5, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm6
+; AVX512-NEXT: vmovq %xmm4, %rdx
+; AVX512-NEXT: vmovq %xmm5, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm4
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512-NEXT: vpextrq $1, %xmm3, %rdx
+; AVX512-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgq %rcx, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm5
+; AVX512-NEXT: vmovq %xmm3, %rdx
+; AVX512-NEXT: vmovq %xmm1, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: cmovgq %rcx, %rax
+; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = icmp sgt <16 x i64> %a0, %a1
+ ret <16 x i1> %1
+}
+
+define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v32i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT: psllw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: psllw $15, %xmm0
+; SSE2-NEXT: psraw $15, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT: psllw $15, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
+; SSE2-NEXT: psllw $15, %xmm4
+; SSE2-NEXT: psraw $15, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: packuswb %xmm2, %xmm4
+; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v32i32:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: movdqa {{.*#+}} xmm8 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE42-NEXT: pshufb %xmm8, %xmm3
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm2
+; SSE42-NEXT: pshufb %xmm8, %xmm2
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE42-NEXT: psllw $15, %xmm2
+; SSE42-NEXT: psraw $15, %xmm2
+; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE42-NEXT: pshufb %xmm3, %xmm2
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm1
+; SSE42-NEXT: pshufb %xmm8, %xmm1
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm0
+; SSE42-NEXT: pshufb %xmm8, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE42-NEXT: psllw $15, %xmm0
+; SSE42-NEXT: psraw $15, %xmm0
+; SSE42-NEXT: pshufb %xmm3, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm7
+; SSE42-NEXT: pshufb %xmm8, %xmm7
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm6
+; SSE42-NEXT: pshufb %xmm8, %xmm6
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; SSE42-NEXT: psllw $15, %xmm6
+; SSE42-NEXT: psraw $15, %xmm6
+; SSE42-NEXT: pshufb %xmm3, %xmm6
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm5
+; SSE42-NEXT: pshufb %xmm8, %xmm5
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT: pshufb %xmm8, %xmm4
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE42-NEXT: psllw $15, %xmm4
+; SSE42-NEXT: psraw $15, %xmm4
+; SSE42-NEXT: pshufb %xmm3, %xmm4
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; SSE42-NEXT: pextrb $15, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $14, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $13, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $12, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $11, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $10, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $9, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $8, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $7, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $6, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $5, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $4, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $3, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $2, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $1, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $0, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: movq %rdi, %rax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v32i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9
+; AVX1-NEXT: vpcmpgtd %xmm8, %xmm9, %xmm9
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vpand %xmm8, %xmm9, %xmm9
+; AVX1-NEXT: vpcmpgtd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm9, %xmm3, %xmm9
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm8, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm9, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v32i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtd %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm8, %ymm3, %ymm3
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-NEXT: vpcmpgtd %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm8, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX2-NEXT: vpcmpgtd %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %ymm8, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v32i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextracti32x4 $3, %zmm2, %xmm4
+; AVX512-NEXT: vpextrd $1, %xmm4, %ecx
+; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm5
+; AVX512-NEXT: vpextrd $1, %xmm5, %edx
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: cmpl %ecx, %edx
+; AVX512-NEXT: movl $-1, %ecx
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vmovd %xmm4, %esi
+; AVX512-NEXT: vmovd %xmm5, %edi
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmovgl %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm6
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512-NEXT: vpextrd $2, %xmm4, %edx
+; AVX512-NEXT: vpextrd $2, %xmm5, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512-NEXT: vpextrd $3, %xmm4, %edx
+; AVX512-NEXT: vpextrd $3, %xmm5, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
+; AVX512-NEXT: vextracti32x4 $2, %zmm2, %xmm5
+; AVX512-NEXT: vpextrd $1, %xmm5, %edx
+; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm6
+; AVX512-NEXT: vpextrd $1, %xmm6, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vmovd %xmm5, %esi
+; AVX512-NEXT: vmovd %xmm6, %edi
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmovgl %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm7
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
+; AVX512-NEXT: vpextrd $2, %xmm5, %edx
+; AVX512-NEXT: vpextrd $2, %xmm6, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
+; AVX512-NEXT: vpextrd $3, %xmm5, %edx
+; AVX512-NEXT: vpextrd $3, %xmm6, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512-NEXT: vextracti32x4 $1, %zmm2, %xmm5
+; AVX512-NEXT: vpextrd $1, %xmm5, %edx
+; AVX512-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; AVX512-NEXT: vpextrd $1, %xmm6, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vmovd %xmm5, %esi
+; AVX512-NEXT: vmovd %xmm6, %edi
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmovgl %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm7
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
+; AVX512-NEXT: vpextrd $2, %xmm5, %edx
+; AVX512-NEXT: vpextrd $2, %xmm6, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
+; AVX512-NEXT: vpextrd $3, %xmm5, %edx
+; AVX512-NEXT: vpextrd $3, %xmm6, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
+; AVX512-NEXT: vpextrd $1, %xmm2, %edx
+; AVX512-NEXT: vpextrd $1, %xmm0, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vmovd %xmm2, %esi
+; AVX512-NEXT: vmovd %xmm0, %edi
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmovgl %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm6
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512-NEXT: vpextrd $2, %xmm2, %edx
+; AVX512-NEXT: vpextrd $2, %xmm0, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512-NEXT: vpextrd $3, %xmm2, %edx
+; AVX512-NEXT: vpextrd $3, %xmm0, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vextracti32x4 $3, %zmm3, %xmm2
+; AVX512-NEXT: vpextrd $1, %xmm2, %edx
+; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; AVX512-NEXT: vpextrd $1, %xmm4, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vmovd %xmm2, %esi
+; AVX512-NEXT: vmovd %xmm4, %edi
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmovgl %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm5
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512-NEXT: vpextrd $2, %xmm2, %edx
+; AVX512-NEXT: vpextrd $2, %xmm4, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; AVX512-NEXT: vpextrd $3, %xmm2, %edx
+; AVX512-NEXT: vpextrd $3, %xmm4, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2
+; AVX512-NEXT: vextracti32x4 $2, %zmm3, %xmm4
+; AVX512-NEXT: vpextrd $1, %xmm4, %edx
+; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; AVX512-NEXT: vpextrd $1, %xmm5, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vmovd %xmm4, %esi
+; AVX512-NEXT: vmovd %xmm5, %edi
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmovgl %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm6
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512-NEXT: vpextrd $2, %xmm4, %edx
+; AVX512-NEXT: vpextrd $2, %xmm5, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512-NEXT: vpextrd $3, %xmm4, %edx
+; AVX512-NEXT: vpextrd $3, %xmm5, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512-NEXT: vextracti32x4 $1, %zmm3, %xmm4
+; AVX512-NEXT: vpextrd $1, %xmm4, %edx
+; AVX512-NEXT: vextracti32x4 $1, %zmm1, %xmm5
+; AVX512-NEXT: vpextrd $1, %xmm5, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vmovd %xmm4, %esi
+; AVX512-NEXT: vmovd %xmm5, %edi
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmovgl %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm6
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512-NEXT: vpextrd $2, %xmm4, %edx
+; AVX512-NEXT: vpextrd $2, %xmm5, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512-NEXT: vpextrd $3, %xmm4, %edx
+; AVX512-NEXT: vpextrd $3, %xmm5, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
+; AVX512-NEXT: vpextrd $1, %xmm3, %edx
+; AVX512-NEXT: vpextrd $1, %xmm1, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vmovd %xmm3, %esi
+; AVX512-NEXT: vmovd %xmm1, %edi
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: movl $0, %esi
+; AVX512-NEXT: cmovgl %ecx, %esi
+; AVX512-NEXT: vmovd %esi, %xmm5
+; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512-NEXT: vpextrd $2, %xmm3, %edx
+; AVX512-NEXT: vpextrd $2, %xmm1, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: movl $0, %edx
+; AVX512-NEXT: cmovgl %ecx, %edx
+; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; AVX512-NEXT: vpextrd $3, %xmm3, %edx
+; AVX512-NEXT: vpextrd $3, %xmm1, %esi
+; AVX512-NEXT: cmpl %edx, %esi
+; AVX512-NEXT: cmovgl %ecx, %eax
+; AVX512-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = icmp sgt <32 x i32> %a0, %a1
+ ret <32 x i1> %1
+}
+
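+; Verify lowering of a signed-greater-than compare between two <64 x i16>
+; vectors into a <64 x i1> mask across the SSE2/SSE42/AVX1/AVX2/AVX512 targets.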
+define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v64i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm8, %xmm1
+; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: pand %xmm8, %xmm3
+; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: packuswb %xmm3, %xmm2
+; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: packuswb %xmm5, %xmm4
+; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: pand %xmm8, %xmm6
+; SSE2-NEXT: packuswb %xmm7, %xmm6
+; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v64i16:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm1
+; SSE42-NEXT: movdqa {{.*#+}} xmm8 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; SSE42-NEXT: pshufb %xmm8, %xmm1
+; SSE42-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm0
+; SSE42-NEXT: pshufb %xmm8, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE42-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: pshufb %xmm8, %xmm3
+; SSE42-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm2
+; SSE42-NEXT: pshufb %xmm8, %xmm2
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE42-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm5
+; SSE42-NEXT: pshufb %xmm8, %xmm5
+; SSE42-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT: pshufb %xmm8, %xmm4
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSE42-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm7
+; SSE42-NEXT: pshufb %xmm8, %xmm7
+; SSE42-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm6
+; SSE42-NEXT: pshufb %xmm8, %xmm6
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; SSE42-NEXT: pextrb $15, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $14, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $13, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $12, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $11, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $10, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $9, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $8, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $7, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $6, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $5, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $4, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $3, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $2, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $1, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $0, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: movq %rdi, %rax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v64i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9
+; AVX1-NEXT: vpcmpgtw %xmm8, %xmm9, %xmm8
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm9, %xmm8, %xmm8
+; AVX1-NEXT: vpcmpgtw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm0[0],xmm8[0]
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpcmpgtw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpcmpgtw %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtw %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpcmpgtw %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtw %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; AVX1-NEXT: vpextrb $15, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v64i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm8
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm8, %xmm8
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0]
+; AVX2-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; AVX2-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; AVX2-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; AVX2-NEXT: vpextrb $15, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $15, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v64i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3
+; AVX512-NEXT: vpmovsxwd %ymm3, %zmm3
+; AVX512-NEXT: vpslld $31, %zmm3, %zmm3
+; AVX512-NEXT: vptestmd %zmm3, %zmm3, %k0
+; AVX512-NEXT: kshiftlw $14, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: kshiftlw $15, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm3
+; AVX512-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $13, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $12, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $11, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $10, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $9, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $8, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $7, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $6, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $5, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $4, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $3, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $2, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $1, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512-NEXT: kshiftlw $0, %k0, %k0
+; AVX512-NEXT: kshiftrw $15, %k0, %k0
+; AVX512-NEXT: kmovw %k0, %eax
+; AVX512-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm2
+; AVX512-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512-NEXT: kshiftlw $14, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: kshiftlw $15, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm2
+; AVX512-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $13, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $12, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $11, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $10, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $9, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $8, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $7, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $6, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $5, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $4, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $3, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $2, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $1, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; AVX512-NEXT: kshiftlw $0, %k0, %k0
+; AVX512-NEXT: kshiftrw $15, %k0, %k0
+; AVX512-NEXT: kmovw %k0, %eax
+; AVX512-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vpsllw $7, %ymm2, %ymm3
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpand %ymm2, %ymm3, %ymm3
+; AVX512-NEXT: vpxor %ymm6, %ymm6, %ymm6
+; AVX512-NEXT: vpcmpgtb %ymm3, %ymm6, %ymm3
+; AVX512-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1
+; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512-NEXT: kshiftlw $14, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: kshiftlw $15, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm1
+; AVX512-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $13, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $12, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $11, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $10, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $9, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $8, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $7, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $6, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $5, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $4, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $3, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $2, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $1, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX512-NEXT: kshiftlw $0, %k0, %k0
+; AVX512-NEXT: kshiftrw $15, %k0, %k0
+; AVX512-NEXT: kmovw %k0, %eax
+; AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0
+; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512-NEXT: kshiftlw $14, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: kshiftlw $15, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm0
+; AVX512-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $13, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $12, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $11, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $10, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $9, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $8, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $7, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $6, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $5, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $4, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $3, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $2, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $1, %k0, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovw %k1, %eax
+; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512-NEXT: kshiftlw $0, %k0, %k0
+; AVX512-NEXT: kshiftrw $15, %k0, %k0
+; AVX512-NEXT: kmovw %k0, %eax
+; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpcmpgtb %ymm0, %ymm6, %ymm1
+; AVX512-NEXT: vpsllw $7, %xmm1, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512-NEXT: vpcmpgtb %xmm0, %xmm5, %xmm0
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512-NEXT: vpsllw $7, %xmm1, %xmm1
+; AVX512-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpgtb %xmm1, %xmm5, %xmm1
+; AVX512-NEXT: vpsllw $7, %xmm3, %xmm2
+; AVX512-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpgtb %xmm2, %xmm5, %xmm2
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX512-NEXT: vpsllw $7, %xmm3, %xmm3
+; AVX512-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vpcmpgtb %xmm3, %xmm5, %xmm3
+; AVX512-NEXT: retq
+ %1 = icmp sgt <64 x i16> %a0, %a1
+ ret <64 x i1> %1
+}
+
+define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v128i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 14(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 12(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 10(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 8(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 6(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 4(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, 2(%rdi)
+; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT: andb $1, %al
+; SSE2-NEXT: movb %al, (%rdi)
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: popq %rcx
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_cmp_v128i8:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm0
+; SSE42-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm1
+; SSE42-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm2
+; SSE42-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm5
+; SSE42-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm6
+; SSE42-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm7
+; SSE42-NEXT: pextrb $15, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm7, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 14(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm6, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 12(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm5, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 10(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm4, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 8(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm3, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 6(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm2, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 4(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $12, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $11, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $10, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $9, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $8, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $7, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $6, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $5, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $4, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $3, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $2, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $1, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $0, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $15, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $14, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $13, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $12, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $11, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $10, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $9, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $8, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $7, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $6, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $5, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $4, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $3, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $2, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $1, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $0, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: movq %rdi, %rax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_cmp_v128i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpcmpgtb %xmm4, %xmm0, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm5, %xmm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm7, %xmm3, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpextrb $15, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm3, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm6, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 12(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm2, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm5, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 8(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm1, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm4, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, 4(%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $15, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $14, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $13, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $12, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $11, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $10, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $9, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $8, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $7, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $6, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $5, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $4, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $3, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $2, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $1, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: vpextrb $0, %xmm8, %eax
+; AVX1-NEXT: andb $1, %al
+; AVX1-NEXT: movb %al, (%rdi)
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_cmp_v128i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpextrb $15, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm4, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $15, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 12(%rdi)
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpextrb $15, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm3, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $15, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 8(%rdi)
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $15, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, 4(%rdi)
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $15, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $14, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $13, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $12, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $11, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $10, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $9, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $8, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $7, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $6, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $5, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $4, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $3, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $2, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $1, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; AVX2-NEXT: andb $1, %al
+; AVX2-NEXT: movb %al, (%rdi)
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cmp_v128i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0
+; AVX512-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm1
+; AVX512-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm2
+; AVX512-NEXT: vpcmpgtb %ymm7, %ymm3, %ymm3
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512-NEXT: vpmovsxbd %xmm4, %zmm4
+; AVX512-NEXT: vpslld $31, %zmm4, %zmm4
+; AVX512-NEXT: vptestmd %zmm4, %zmm4, %k0
+; AVX512-NEXT: kmovw %k0, 14(%rdi)
+; AVX512-NEXT: vpmovsxbd %xmm3, %zmm3
+; AVX512-NEXT: vpslld $31, %zmm3, %zmm3
+; AVX512-NEXT: vptestmd %zmm3, %zmm3, %k0
+; AVX512-NEXT: kmovw %k0, 12(%rdi)
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vpmovsxbd %xmm3, %zmm3
+; AVX512-NEXT: vpslld $31, %zmm3, %zmm3
+; AVX512-NEXT: vptestmd %zmm3, %zmm3, %k0
+; AVX512-NEXT: kmovw %k0, 10(%rdi)
+; AVX512-NEXT: vpmovsxbd %xmm2, %zmm2
+; AVX512-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512-NEXT: kmovw %k0, 8(%rdi)
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vpmovsxbd %xmm2, %zmm2
+; AVX512-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512-NEXT: kmovw %k0, 6(%rdi)
+; AVX512-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512-NEXT: kmovw %k0, 4(%rdi)
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512-NEXT: kmovw %k0, 2(%rdi)
+; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512-NEXT: kmovw %k0, (%rdi)
+; AVX512-NEXT: movq %rdi, %rax
+; AVX512-NEXT: retq
+ %1 = icmp sgt <128 x i8> %a0, %a1
+ ret <128 x i1> %1
+}
diff --git a/test/CodeGen/X86/vector-gep.ll b/test/CodeGen/X86/vector-gep.ll
index 47878360ca0a..4c5c348302b7 100644
--- a/test/CodeGen/X86/vector-gep.ll
+++ b/test/CodeGen/X86/vector-gep.ll
@@ -114,3 +114,12 @@ entry:
ret <4 x i16*> %A
;CHECK: ret
}
+
+;CHECK-LABEL: AGEP9:
+define <64 x i16*> @AGEP9(i16* %param, <64 x i32> %off) nounwind {
+entry:
+;CHECK: vbroadcastss
+ %A = getelementptr i16, i16* %param, <64 x i32> %off
+ ret <64 x i16*> %A
+;CHECK: ret
+}
diff --git a/test/CodeGen/X86/vector-half-conversions.ll b/test/CodeGen/X86/vector-half-conversions.ll
new file mode 100644
index 000000000000..b091d1bca2ef
--- /dev/null
+++ b/test/CodeGen/X86/vector-half-conversions.ll
@@ -0,0 +1,3922 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512
+
+;
+; Half to Float
+;
+
+define float @cvt_i16_to_f32(i16 %a0) {
+; ALL-LABEL: cvt_i16_to_f32:
+; ALL: # BB#0:
+; ALL-NEXT: movswl %di, %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: retq
+ %1 = bitcast i16 %a0 to half
+ %2 = fpext half %1 to float
+ ret float %2
+}
+
+define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) {
+; ALL-LABEL: cvt_4i16_to_4f32:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; ALL-NEXT: vmovq %xmm0, %rax
+; ALL-NEXT: movq %rax, %rcx
+; ALL-NEXT: movq %rax, %rdx
+; ALL-NEXT: movswl %ax, %esi
+; ALL-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; ALL-NEXT: shrl $16, %eax
+; ALL-NEXT: shrq $32, %rcx
+; ALL-NEXT: shrq $48, %rdx
+; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: vmovd %edx, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl %cx, %ecx
+; ALL-NEXT: vmovd %ecx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: vmovd %esi, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: retq
+ %1 = bitcast <4 x i16> %a0 to <4 x half>
+ %2 = fpext <4 x half> %1 to <4 x float>
+ ret <4 x float> %2
+}
+
+define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) {
+; ALL-LABEL: cvt_8i16_to_4f32:
+; ALL: # BB#0:
+; ALL-NEXT: vmovq %xmm0, %rax
+; ALL-NEXT: movq %rax, %rcx
+; ALL-NEXT: movq %rax, %rdx
+; ALL-NEXT: movswl %ax, %esi
+; ALL-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; ALL-NEXT: shrl $16, %eax
+; ALL-NEXT: shrq $32, %rcx
+; ALL-NEXT: shrq $48, %rdx
+; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: vmovd %edx, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl %cx, %ecx
+; ALL-NEXT: vmovd %ecx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: vmovd %esi, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = bitcast <4 x i16> %1 to <4 x half>
+ %3 = fpext <4 x half> %2 to <4 x float>
+ ret <4 x float> %3
+}
+
+define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) {
+; AVX1-LABEL: cvt_8i16_to_8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: movq %rdx, %r10
+; AVX1-NEXT: movswl %dx, %r9d
+; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
+; AVX1-NEXT: shrl $16, %edx
+; AVX1-NEXT: shrq $32, %r8
+; AVX1-NEXT: shrq $48, %r10
+; AVX1-NEXT: vmovq %xmm0, %rdi
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: movq %rdi, %rsi
+; AVX1-NEXT: movswl %di, %ecx
+; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
+; AVX1-NEXT: shrl $16, %edi
+; AVX1-NEXT: shrq $32, %rax
+; AVX1-NEXT: shrq $48, %rsi
+; AVX1-NEXT: movswl %si, %esi
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: movswl %di, %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: vmovd %ecx, %xmm3
+; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX1-NEXT: movswl %r10w, %eax
+; AVX1-NEXT: vmovd %eax, %xmm4
+; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX1-NEXT: movswl %r8w, %eax
+; AVX1-NEXT: vmovd %eax, %xmm5
+; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX1-NEXT: movswl %dx, %eax
+; AVX1-NEXT: vmovd %eax, %xmm6
+; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX1-NEXT: vmovd %r9d, %xmm7
+; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cvt_8i16_to_8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: movq %rdx, %r10
+; AVX2-NEXT: movswl %dx, %r9d
+; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
+; AVX2-NEXT: shrl $16, %edx
+; AVX2-NEXT: shrq $32, %r8
+; AVX2-NEXT: shrq $48, %r10
+; AVX2-NEXT: vmovq %xmm0, %rdi
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: movq %rdi, %rsi
+; AVX2-NEXT: movswl %di, %ecx
+; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
+; AVX2-NEXT: shrl $16, %edi
+; AVX2-NEXT: shrq $32, %rax
+; AVX2-NEXT: shrq $48, %rsi
+; AVX2-NEXT: movswl %si, %esi
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: movswl %di, %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vmovd %ecx, %xmm3
+; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX2-NEXT: movswl %r10w, %eax
+; AVX2-NEXT: vmovd %eax, %xmm4
+; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX2-NEXT: movswl %r8w, %eax
+; AVX2-NEXT: vmovd %eax, %xmm5
+; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX2-NEXT: movswl %dx, %eax
+; AVX2-NEXT: vmovd %eax, %xmm6
+; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX2-NEXT: vmovd %r9d, %xmm7
+; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: cvt_8i16_to_8f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512-NEXT: movq %rdx, %r8
+; AVX512-NEXT: movq %rdx, %r10
+; AVX512-NEXT: movswl %dx, %r9d
+; AVX512-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
+; AVX512-NEXT: shrl $16, %edx
+; AVX512-NEXT: shrq $32, %r8
+; AVX512-NEXT: shrq $48, %r10
+; AVX512-NEXT: vmovq %xmm0, %rdi
+; AVX512-NEXT: movq %rdi, %rax
+; AVX512-NEXT: movq %rdi, %rsi
+; AVX512-NEXT: movswl %di, %ecx
+; AVX512-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
+; AVX512-NEXT: shrl $16, %edi
+; AVX512-NEXT: shrq $32, %rax
+; AVX512-NEXT: shrq $48, %rsi
+; AVX512-NEXT: movswl %si, %esi
+; AVX512-NEXT: vmovd %esi, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: movswl %di, %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vmovd %ecx, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: movswl %r10w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: movswl %r8w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: movswl %dx, %eax
+; AVX512-NEXT: vmovd %eax, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: vmovd %r9d, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = bitcast <8 x i16> %a0 to <8 x half>
+ %2 = fpext <8 x half> %1 to <8 x float>
+ ret <8 x float> %2
+}
+
+define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) {
+; AVX1-LABEL: cvt_16i16_to_16f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vmovq %xmm4, %rax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $48, %rcx
+; AVX1-NEXT: movswl %cx, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm8
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $32, %rcx
+; AVX1-NEXT: movswl %cx, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm9
+; AVX1-NEXT: movswl %ax, %ecx
+; AVX1-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm10
+; AVX1-NEXT: vpextrq $1, %xmm4, %rax
+; AVX1-NEXT: vmovd %ecx, %xmm11
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $48, %rcx
+; AVX1-NEXT: movswl %cx, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm12
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $32, %rcx
+; AVX1-NEXT: movswl %cx, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm13
+; AVX1-NEXT: movswl %ax, %ecx
+; AVX1-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm14
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vmovd %ecx, %xmm15
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $48, %rcx
+; AVX1-NEXT: movswl %cx, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm2
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $32, %rcx
+; AVX1-NEXT: movswl %cx, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm3
+; AVX1-NEXT: movswl %ax, %ecx
+; AVX1-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm4
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $48, %rcx
+; AVX1-NEXT: movswl %cx, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm5
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $32, %rcx
+; AVX1-NEXT: movswl %cx, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm6
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $16, %ecx
+; AVX1-NEXT: movswl %cx, %ecx
+; AVX1-NEXT: vmovd %ecx, %xmm7
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm8, %xmm8
+; AVX1-NEXT: vcvtph2ps %xmm9, %xmm9
+; AVX1-NEXT: vcvtph2ps %xmm10, %xmm10
+; AVX1-NEXT: vcvtph2ps %xmm11, %xmm11
+; AVX1-NEXT: vcvtph2ps %xmm12, %xmm12
+; AVX1-NEXT: vcvtph2ps %xmm13, %xmm13
+; AVX1-NEXT: vcvtph2ps %xmm14, %xmm14
+; AVX1-NEXT: vcvtph2ps %xmm15, %xmm15
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cvt_16i16_to_16f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vmovq %xmm4, %rax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $48, %rcx
+; AVX2-NEXT: movswl %cx, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm8
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $32, %rcx
+; AVX2-NEXT: movswl %cx, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm9
+; AVX2-NEXT: movswl %ax, %ecx
+; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm10
+; AVX2-NEXT: vpextrq $1, %xmm4, %rax
+; AVX2-NEXT: vmovd %ecx, %xmm11
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $48, %rcx
+; AVX2-NEXT: movswl %cx, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm12
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $32, %rcx
+; AVX2-NEXT: movswl %cx, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm13
+; AVX2-NEXT: movswl %ax, %ecx
+; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm14
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vmovd %ecx, %xmm15
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $48, %rcx
+; AVX2-NEXT: movswl %cx, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm2
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $32, %rcx
+; AVX2-NEXT: movswl %cx, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm3
+; AVX2-NEXT: movswl %ax, %ecx
+; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm4
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $48, %rcx
+; AVX2-NEXT: movswl %cx, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm5
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $32, %rcx
+; AVX2-NEXT: movswl %cx, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm6
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $16, %ecx
+; AVX2-NEXT: movswl %cx, %ecx
+; AVX2-NEXT: vmovd %ecx, %xmm7
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm8, %xmm8
+; AVX2-NEXT: vcvtph2ps %xmm9, %xmm9
+; AVX2-NEXT: vcvtph2ps %xmm10, %xmm10
+; AVX2-NEXT: vcvtph2ps %xmm11, %xmm11
+; AVX2-NEXT: vcvtph2ps %xmm12, %xmm12
+; AVX2-NEXT: vcvtph2ps %xmm13, %xmm13
+; AVX2-NEXT: vcvtph2ps %xmm14, %xmm14
+; AVX2-NEXT: vcvtph2ps %xmm15, %xmm15
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: cvt_16i16_to_16f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm10
+; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $48, %rcx
+; AVX512-NEXT: movswl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm8
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $32, %rcx
+; AVX512-NEXT: movswl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm9
+; AVX512-NEXT: movswl %ax, %ecx
+; AVX512-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX512-NEXT: shrl $16, %eax
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm11
+; AVX512-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512-NEXT: vmovd %ecx, %xmm12
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $48, %rcx
+; AVX512-NEXT: movswl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm13
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $32, %rcx
+; AVX512-NEXT: movswl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm14
+; AVX512-NEXT: movswl %ax, %ecx
+; AVX512-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX512-NEXT: shrl $16, %eax
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm15
+; AVX512-NEXT: vmovq %xmm10, %rax
+; AVX512-NEXT: vmovd %ecx, %xmm2
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $48, %rcx
+; AVX512-NEXT: movswl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm3
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $32, %rcx
+; AVX512-NEXT: movswl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm1
+; AVX512-NEXT: movswl %ax, %ecx
+; AVX512-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX512-NEXT: shrl $16, %eax
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vpextrq $1, %xmm10, %rax
+; AVX512-NEXT: vmovd %ecx, %xmm10
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $48, %rcx
+; AVX512-NEXT: movswl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm5
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: shrq $32, %rcx
+; AVX512-NEXT: movswl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm6
+; AVX512-NEXT: movl %eax, %ecx
+; AVX512-NEXT: shrl $16, %ecx
+; AVX512-NEXT: movswl %cx, %ecx
+; AVX512-NEXT: vmovd %ecx, %xmm7
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm8, %xmm8
+; AVX512-NEXT: vcvtph2ps %xmm9, %xmm9
+; AVX512-NEXT: vcvtph2ps %xmm11, %xmm11
+; AVX512-NEXT: vcvtph2ps %xmm12, %xmm12
+; AVX512-NEXT: vcvtph2ps %xmm13, %xmm13
+; AVX512-NEXT: vcvtph2ps %xmm14, %xmm14
+; AVX512-NEXT: vcvtph2ps %xmm15, %xmm15
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm10, %xmm10
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %1 = bitcast <16 x i16> %a0 to <16 x half>
+ %2 = fpext <16 x half> %1 to <16 x float>
+ ret <16 x float> %2
+}
+
+;
+; Half to Float (Load)
+;
+
+define float @load_cvt_i16_to_f32(i16* %a0) {
+; ALL-LABEL: load_cvt_i16_to_f32:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: retq
+ %1 = load i16, i16* %a0
+ %2 = bitcast i16 %1 to half
+ %3 = fpext half %2 to float
+ ret float %3
+}
+
+define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) {
+; ALL-LABEL: load_cvt_4i16_to_4f32:
+; ALL: # BB#0:
+; ALL-NEXT: movswl 6(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: retq
+ %1 = load <4 x i16>, <4 x i16>* %a0
+ %2 = bitcast <4 x i16> %1 to <4 x half>
+ %3 = fpext <4 x half> %2 to <4 x float>
+ ret <4 x float> %3
+}
+
+define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) {
+; ALL-LABEL: load_cvt_8i16_to_4f32:
+; ALL: # BB#0:
+; ALL-NEXT: movq (%rdi), %rax
+; ALL-NEXT: movq %rax, %rcx
+; ALL-NEXT: movq %rax, %rdx
+; ALL-NEXT: movswl %ax, %esi
+; ALL-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; ALL-NEXT: shrl $16, %eax
+; ALL-NEXT: shrq $32, %rcx
+; ALL-NEXT: shrq $48, %rdx
+; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: vmovd %edx, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl %cx, %ecx
+; ALL-NEXT: vmovd %ecx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: vmovd %esi, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %a0
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = bitcast <4 x i16> %2 to <4 x half>
+ %4 = fpext <4 x half> %3 to <4 x float>
+ ret <4 x float> %4
+}
+
+define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) {
+; AVX1-LABEL: load_cvt_8i16_to_8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: movswl 6(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX1-NEXT: movswl 4(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: movswl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: movswl 2(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX1-NEXT: movswl 14(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm4
+; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX1-NEXT: movswl 12(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm5
+; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX1-NEXT: movswl 8(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm6
+; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX1-NEXT: movswl 10(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm7
+; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_cvt_8i16_to_8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: movswl 6(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX2-NEXT: movswl 4(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: movswl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: movswl 2(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX2-NEXT: movswl 14(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm4
+; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX2-NEXT: movswl 12(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm5
+; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX2-NEXT: movswl 8(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm6
+; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX2-NEXT: movswl 10(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm7
+; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_cvt_8i16_to_8f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: movswl 6(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: movswl 4(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: movswl (%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: movswl 2(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: movswl 14(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: movswl 12(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: movswl 8(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: movswl 10(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %a0
+ %2 = bitcast <8 x i16> %1 to <8 x half>
+ %3 = fpext <8 x half> %2 to <8 x float>
+ ret <8 x float> %3
+}
+
+define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) {
+; AVX1-LABEL: load_cvt_16i16_to_16f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: movswl 22(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm8
+; AVX1-NEXT: movswl 20(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm9
+; AVX1-NEXT: movswl 16(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm10
+; AVX1-NEXT: movswl 18(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm11
+; AVX1-NEXT: movswl 30(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm12
+; AVX1-NEXT: movswl 28(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm13
+; AVX1-NEXT: movswl 24(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm14
+; AVX1-NEXT: movswl 26(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm15
+; AVX1-NEXT: movswl 6(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX1-NEXT: movswl 4(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX1-NEXT: movswl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX1-NEXT: movswl 2(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm4
+; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX1-NEXT: movswl 14(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm5
+; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX1-NEXT: movswl 12(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm6
+; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX1-NEXT: movswl 8(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm7
+; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX1-NEXT: movswl 10(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_cvt_16i16_to_16f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: movswl 22(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm8
+; AVX2-NEXT: movswl 20(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm9
+; AVX2-NEXT: movswl 16(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm10
+; AVX2-NEXT: movswl 18(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm11
+; AVX2-NEXT: movswl 30(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm12
+; AVX2-NEXT: movswl 28(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm13
+; AVX2-NEXT: movswl 24(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm14
+; AVX2-NEXT: movswl 26(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm15
+; AVX2-NEXT: movswl 6(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX2-NEXT: movswl 4(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: movswl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX2-NEXT: movswl 2(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm4
+; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX2-NEXT: movswl 14(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm5
+; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX2-NEXT: movswl 12(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm6
+; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX2-NEXT: movswl 8(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm7
+; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX2-NEXT: movswl 10(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_cvt_16i16_to_16f32:
+; AVX512: # BB#0:
+; AVX512-NEXT: movswl 6(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm8
+; AVX512-NEXT: movswl 4(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm9
+; AVX512-NEXT: movswl (%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm10
+; AVX512-NEXT: movswl 2(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm11
+; AVX512-NEXT: movswl 14(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm12
+; AVX512-NEXT: movswl 12(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm13
+; AVX512-NEXT: movswl 8(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm14
+; AVX512-NEXT: movswl 10(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm15
+; AVX512-NEXT: movswl 22(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: movswl 20(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: movswl 16(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: movswl 18(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: movswl 30(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: movswl 28(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: movswl 24(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: movswl 26(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %1 = load <16 x i16>, <16 x i16>* %a0
+ %2 = bitcast <16 x i16> %1 to <16 x half>
+ %3 = fpext <16 x half> %2 to <16 x float>
+ ret <16 x float> %3
+}
+
+;
+; Half to Double
+;
+
+define double @cvt_i16_to_f64(i16 %a0) {
+; ALL-LABEL: cvt_i16_to_f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl %di, %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: retq
+ %1 = bitcast i16 %a0 to half
+ %2 = fpext half %1 to double
+ ret double %2
+}
+
+define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) {
+; ALL-LABEL: cvt_2i16_to_2f64:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: movswl %ax, %ecx
+; ALL-NEXT: shrl $16, %eax
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vmovd %ecx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: retq
+ %1 = bitcast <2 x i16> %a0 to <2 x half>
+ %2 = fpext <2 x half> %1 to <2 x double>
+ ret <2 x double> %2
+}
+
+define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) {
+; ALL-LABEL: cvt_4i16_to_4f64:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; ALL-NEXT: vmovq %xmm0, %rax
+; ALL-NEXT: movq %rax, %rcx
+; ALL-NEXT: movl %eax, %edx
+; ALL-NEXT: movswl %ax, %esi
+; ALL-NEXT: shrq $48, %rax
+; ALL-NEXT: shrq $32, %rcx
+; ALL-NEXT: shrl $16, %edx
+; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: vmovd %edx, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vmovd %esi, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl %cx, %ecx
+; ALL-NEXT: vmovd %ecx, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: retq
+ %1 = bitcast <4 x i16> %a0 to <4 x half>
+ %2 = fpext <4 x half> %1 to <4 x double>
+ ret <4 x double> %2
+}
+
+define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) {
+; ALL-LABEL: cvt_8i16_to_2f64:
+; ALL: # BB#0:
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: movswl %ax, %ecx
+; ALL-NEXT: shrl $16, %eax
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vmovd %ecx, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i16> %1 to <2 x half>
+ %3 = fpext <2 x half> %2 to <2 x double>
+ ret <2 x double> %3
+}
+
+define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) {
+; ALL-LABEL: cvt_8i16_to_4f64:
+; ALL: # BB#0:
+; ALL-NEXT: vmovq %xmm0, %rax
+; ALL-NEXT: movq %rax, %rcx
+; ALL-NEXT: movl %eax, %edx
+; ALL-NEXT: movswl %ax, %esi
+; ALL-NEXT: shrq $48, %rax
+; ALL-NEXT: shrq $32, %rcx
+; ALL-NEXT: shrl $16, %edx
+; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: vmovd %edx, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vmovd %esi, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl %cx, %ecx
+; ALL-NEXT: vmovd %ecx, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = bitcast <4 x i16> %1 to <4 x half>
+ %3 = fpext <4 x half> %2 to <4 x double>
+ ret <4 x double> %3
+}
+
+define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) {
+; AVX1-LABEL: cvt_8i16_to_8f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovq %xmm0, %rdx
+; AVX1-NEXT: movq %rdx, %r9
+; AVX1-NEXT: movl %edx, %r10d
+; AVX1-NEXT: movswl %dx, %r8d
+; AVX1-NEXT: shrq $48, %rdx
+; AVX1-NEXT: shrq $32, %r9
+; AVX1-NEXT: shrl $16, %r10d
+; AVX1-NEXT: vpextrq $1, %xmm0, %rdi
+; AVX1-NEXT: movq %rdi, %rsi
+; AVX1-NEXT: movl %edi, %eax
+; AVX1-NEXT: movswl %di, %ecx
+; AVX1-NEXT: shrq $48, %rdi
+; AVX1-NEXT: shrq $32, %rsi
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2
+; AVX1-NEXT: movswl %si, %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3
+; AVX1-NEXT: movswl %di, %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4
+; AVX1-NEXT: movswl %r10w, %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX1-NEXT: vmovd %r8d, %xmm5
+; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX1-NEXT: movswl %r9w, %eax
+; AVX1-NEXT: vmovd %eax, %xmm6
+; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX1-NEXT: movswl %dx, %eax
+; AVX1-NEXT: vmovd %eax, %xmm7
+; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cvt_8i16_to_8f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq %xmm0, %rdx
+; AVX2-NEXT: movq %rdx, %r9
+; AVX2-NEXT: movl %edx, %r10d
+; AVX2-NEXT: movswl %dx, %r8d
+; AVX2-NEXT: shrq $48, %rdx
+; AVX2-NEXT: shrq $32, %r9
+; AVX2-NEXT: shrl $16, %r10d
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdi
+; AVX2-NEXT: movq %rdi, %rsi
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: movswl %di, %ecx
+; AVX2-NEXT: shrq $48, %rdi
+; AVX2-NEXT: shrq $32, %rsi
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
+; AVX2-NEXT: movswl %si, %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3
+; AVX2-NEXT: movswl %di, %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4
+; AVX2-NEXT: movswl %r10w, %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX2-NEXT: vmovd %r8d, %xmm5
+; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX2-NEXT: movswl %r9w, %eax
+; AVX2-NEXT: vmovd %eax, %xmm6
+; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX2-NEXT: movswl %dx, %eax
+; AVX2-NEXT: vmovd %eax, %xmm7
+; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: cvt_8i16_to_8f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512-NEXT: movq %rdx, %r8
+; AVX512-NEXT: movl %edx, %r10d
+; AVX512-NEXT: movswl %dx, %r9d
+; AVX512-NEXT: shrq $48, %rdx
+; AVX512-NEXT: shrq $32, %r8
+; AVX512-NEXT: shrl $16, %r10d
+; AVX512-NEXT: vmovq %xmm0, %rdi
+; AVX512-NEXT: movq %rdi, %rax
+; AVX512-NEXT: movl %edi, %esi
+; AVX512-NEXT: movswl %di, %ecx
+; AVX512-NEXT: shrq $48, %rdi
+; AVX512-NEXT: shrq $32, %rax
+; AVX512-NEXT: shrl $16, %esi
+; AVX512-NEXT: movswl %si, %esi
+; AVX512-NEXT: vmovd %esi, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovd %ecx, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: movswl %di, %eax
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: movswl %r10w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: vmovd %r9d, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: movswl %r8w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: movswl %dx, %eax
+; AVX512-NEXT: vmovd %eax, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm5[0],xmm4[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %1 = bitcast <8 x i16> %a0 to <8 x half>
+ %2 = fpext <8 x half> %1 to <8 x double>
+ ret <8 x double> %2
+}
+
+;
+; Half to Double (Load)
+;
+
+define double @load_cvt_i16_to_f64(i16* %a0) {
+; ALL-LABEL: load_cvt_i16_to_f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: retq
+ %1 = load i16, i16* %a0
+ %2 = bitcast i16 %1 to half
+ %3 = fpext half %2 to double
+ ret double %3
+}
+
+define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) {
+; ALL-LABEL: load_cvt_2i16_to_2f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; ALL-NEXT: retq
+ %1 = load <2 x i16>, <2 x i16>* %a0
+ %2 = bitcast <2 x i16> %1 to <2 x half>
+ %3 = fpext <2 x half> %2 to <2 x double>
+ ret <2 x double> %3
+}
+
+define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) {
+; ALL-LABEL: load_cvt_4i16_to_4f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: movswl 6(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: retq
+ %1 = load <4 x i16>, <4 x i16>* %a0
+ %2 = bitcast <4 x i16> %1 to <4 x half>
+ %3 = fpext <4 x half> %2 to <4 x double>
+ ret <4 x double> %3
+}
+
+define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) {
+; ALL-LABEL: load_cvt_8i16_to_4f64:
+; ALL: # BB#0:
+; ALL-NEXT: movq (%rdi), %rax
+; ALL-NEXT: movq %rax, %rcx
+; ALL-NEXT: movl %eax, %edx
+; ALL-NEXT: movswl %ax, %esi
+; ALL-NEXT: shrq $48, %rax
+; ALL-NEXT: shrq $32, %rcx
+; ALL-NEXT: shrl $16, %edx
+; ALL-NEXT: movswl %dx, %edx
+; ALL-NEXT: vmovd %edx, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vmovd %esi, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl %cx, %ecx
+; ALL-NEXT: vmovd %ecx, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %a0
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = bitcast <4 x i16> %2 to <4 x half>
+ %4 = fpext <4 x half> %3 to <4 x double>
+ ret <4 x double> %4
+}
+
+define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) {
+; AVX1-LABEL: load_cvt_8i16_to_8f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: movswl 8(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1
+; AVX1-NEXT: movswl 10(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2
+; AVX1-NEXT: movswl 12(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3
+; AVX1-NEXT: movswl 14(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm4
+; AVX1-NEXT: movswl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX1-NEXT: movswl 2(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm5
+; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX1-NEXT: movswl 4(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm6
+; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX1-NEXT: movswl 6(%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm7
+; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_cvt_8i16_to_8f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: movswl 8(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
+; AVX2-NEXT: movswl 10(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
+; AVX2-NEXT: movswl 12(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3
+; AVX2-NEXT: movswl 14(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm4
+; AVX2-NEXT: movswl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX2-NEXT: movswl 2(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm5
+; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX2-NEXT: movswl 4(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm6
+; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX2-NEXT: movswl 6(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm7
+; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_cvt_8i16_to_8f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: movswl (%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: movswl 2(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: movswl 4(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: movswl 6(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: movswl 8(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: movswl 10(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: movswl 12(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: movswl 14(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %1 = load <8 x i16>, <8 x i16>* %a0
+ %2 = bitcast <8 x i16> %1 to <8 x half>
+ %3 = fpext <8 x half> %2 to <8 x double>
+ ret <8 x double> %3
+}
+
+;
+; Float to Half
+;
+
+define i16 @cvt_f32_to_i16(float %a0) {
+; ALL-LABEL: cvt_f32_to_i16:
+; ALL: # BB#0:
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ALL-NEXT: retq
+ %1 = fptrunc float %a0 to half
+ %2 = bitcast half %1 to i16
+ ret i16 %2
+}
+
+define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) {
+; ALL-LABEL: cvt_4f32_to_4i16:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %eax, %edx
+; ALL-NEXT: shlq $32, %rdx
+; ALL-NEXT: orq %rcx, %rdx
+; ALL-NEXT: vmovq %rdx, %xmm0
+; ALL-NEXT: retq
+ %1 = fptrunc <4 x float> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ ret <4 x i16> %2
+}
+
+define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) {
+; ALL-LABEL: cvt_4f32_to_8i16_undef:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %eax, %edx
+; ALL-NEXT: shlq $32, %rdx
+; ALL-NEXT: orq %rcx, %rdx
+; ALL-NEXT: vmovq %rdx, %xmm0
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; ALL-NEXT: retq
+ %1 = fptrunc <4 x float> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) {
+; ALL-LABEL: cvt_4f32_to_8i16_zero:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %eax, %edx
+; ALL-NEXT: shlq $32, %rdx
+; ALL-NEXT: orq %rcx, %rdx
+; ALL-NEXT: vmovq %rdx, %xmm0
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; ALL-NEXT: retq
+ %1 = fptrunc <4 x float> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) {
+; AVX1-LABEL: cvt_8f32_to_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: shll $16, %eax
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %ecx
+; AVX1-NEXT: movzwl %cx, %ecx
+; AVX1-NEXT: orl %eax, %ecx
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %edx
+; AVX1-NEXT: shll $16, %edx
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: orl %edx, %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %ecx
+; AVX1-NEXT: shll $16, %ecx
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %edx
+; AVX1-NEXT: movzwl %dx, %edx
+; AVX1-NEXT: orl %ecx, %edx
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %ecx
+; AVX1-NEXT: shll $16, %ecx
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %esi
+; AVX1-NEXT: movzwl %si, %esi
+; AVX1-NEXT: orl %ecx, %esi
+; AVX1-NEXT: shlq $32, %rsi
+; AVX1-NEXT: orq %rdx, %rsi
+; AVX1-NEXT: vmovq %rsi, %xmm0
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cvt_8f32_to_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: shll $16, %eax
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %ecx
+; AVX2-NEXT: movzwl %cx, %ecx
+; AVX2-NEXT: orl %eax, %ecx
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %edx
+; AVX2-NEXT: shll $16, %edx
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: orl %edx, %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %ecx
+; AVX2-NEXT: shll $16, %ecx
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %edx
+; AVX2-NEXT: movzwl %dx, %edx
+; AVX2-NEXT: orl %ecx, %edx
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %ecx
+; AVX2-NEXT: shll $16, %ecx
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %esi
+; AVX2-NEXT: movzwl %si, %esi
+; AVX2-NEXT: orl %ecx, %esi
+; AVX2-NEXT: shlq $32, %rsi
+; AVX2-NEXT: orq %rdx, %rsi
+; AVX2-NEXT: vmovq %rsi, %xmm0
+; AVX2-NEXT: vmovq %rax, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: cvt_8f32_to_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: shll $16, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %ecx
+; AVX512-NEXT: movzwl %cx, %ecx
+; AVX512-NEXT: orl %eax, %ecx
+; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %edx
+; AVX512-NEXT: shll $16, %edx
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %edx, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %rcx, %rax
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %ecx
+; AVX512-NEXT: shll $16, %ecx
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %edx
+; AVX512-NEXT: movzwl %dx, %edx
+; AVX512-NEXT: orl %ecx, %edx
+; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %ecx
+; AVX512-NEXT: shll $16, %ecx
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %esi
+; AVX512-NEXT: movzwl %si, %esi
+; AVX512-NEXT: orl %ecx, %esi
+; AVX512-NEXT: shlq $32, %rsi
+; AVX512-NEXT: orq %rdx, %rsi
+; AVX512-NEXT: vmovq %rsi, %xmm0
+; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: retq
+ %1 = fptrunc <8 x float> %a0 to <8 x half>
+ %2 = bitcast <8 x half> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) {
+; AVX1-LABEL: cvt_16f32_to_16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm2
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm1
+; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cvt_16f32_to_16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm2
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm1
+; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: cvt_16f32_to_16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm2
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1
+; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %1 = fptrunc <16 x float> %a0 to <16 x half>
+ %2 = bitcast <16 x half> %1 to <16 x i16>
+ ret <16 x i16> %2
+}
+
+;
+; Float to Half (Store)
+;
+
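+; The store variants check the same VCVTPS2PH lowering, but the converted
+; halves are written out with scalar word stores (MOVW) rather than returned
+; in a vector register. Sketch of the scalar case (illustrative names):
+;
+;   %h = fptrunc float %f to half
+;   %w = bitcast half %h to i16
+;   store i16 %w, i16* %p
+;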
+define void @store_cvt_f32_to_i16(float %a0, i16* %a1) {
+; ALL-LABEL: store_cvt_f32_to_i16:
+; ALL: # BB#0:
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: movw %ax, (%rdi)
+; ALL-NEXT: retq
+ %1 = fptrunc float %a0 to half
+ %2 = bitcast half %1 to i16
+ store i16 %2, i16* %a1
+ ret void
+}
+
+define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) {
+; ALL-LABEL: store_cvt_4f32_to_4i16:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %edx
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %esi
+; ALL-NEXT: movw %si, (%rdi)
+; ALL-NEXT: movw %dx, 6(%rdi)
+; ALL-NEXT: movw %cx, 4(%rdi)
+; ALL-NEXT: movw %ax, 2(%rdi)
+; ALL-NEXT: retq
+ %1 = fptrunc <4 x float> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ store <4 x i16> %2, <4 x i16>* %a1
+ ret void
+}
+
+define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) {
+; ALL-LABEL: store_cvt_4f32_to_8i16_undef:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %eax, %edx
+; ALL-NEXT: shlq $32, %rdx
+; ALL-NEXT: orq %rcx, %rdx
+; ALL-NEXT: vmovq %rdx, %xmm0
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; ALL-NEXT: vmovdqa %xmm0, (%rdi)
+; ALL-NEXT: retq
+ %1 = fptrunc <4 x float> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <8 x i16> %3, <8 x i16>* %a1
+ ret void
+}
+
+define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) {
+; ALL-LABEL: store_cvt_4f32_to_8i16_zero:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %eax, %edx
+; ALL-NEXT: shlq $32, %rdx
+; ALL-NEXT: orq %rcx, %rdx
+; ALL-NEXT: vmovq %rdx, %xmm0
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; ALL-NEXT: vmovdqa %xmm0, (%rdi)
+; ALL-NEXT: retq
+ %1 = fptrunc <4 x float> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <8 x i16> %3, <8 x i16>* %a1
+ ret void
+}
+
+define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) {
+; AVX1-LABEL: store_cvt_8f32_to_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %r8d
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %r9d
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovd %xmm1, %r10d
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX1-NEXT: vmovd %xmm2, %r11d
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX1-NEXT: vmovd %xmm2, %ecx
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %edx
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; AVX1-NEXT: vmovd %xmm0, %esi
+; AVX1-NEXT: movw %si, 8(%rdi)
+; AVX1-NEXT: movw %dx, (%rdi)
+; AVX1-NEXT: movw %cx, 14(%rdi)
+; AVX1-NEXT: movw %ax, 12(%rdi)
+; AVX1-NEXT: movw %r11w, 10(%rdi)
+; AVX1-NEXT: movw %r10w, 6(%rdi)
+; AVX1-NEXT: movw %r9w, 4(%rdi)
+; AVX1-NEXT: movw %r8w, 2(%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: store_cvt_8f32_to_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %r8d
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %r9d
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovd %xmm1, %r10d
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX2-NEXT: vmovd %xmm2, %r11d
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX2-NEXT: vmovd %xmm2, %ecx
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %edx
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %esi
+; AVX2-NEXT: movw %si, 8(%rdi)
+; AVX2-NEXT: movw %dx, (%rdi)
+; AVX2-NEXT: movw %cx, 14(%rdi)
+; AVX2-NEXT: movw %ax, 12(%rdi)
+; AVX2-NEXT: movw %r11w, 10(%rdi)
+; AVX2-NEXT: movw %r10w, 6(%rdi)
+; AVX2-NEXT: movw %r9w, 4(%rdi)
+; AVX2-NEXT: movw %r8w, 2(%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: store_cvt_8f32_to_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %r8d
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %r9d
+; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %xmm1, %r10d
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vmovd %xmm2, %r11d
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vmovd %xmm2, %ecx
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %edx
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %esi
+; AVX512-NEXT: movw %si, 8(%rdi)
+; AVX512-NEXT: movw %dx, (%rdi)
+; AVX512-NEXT: movw %cx, 14(%rdi)
+; AVX512-NEXT: movw %ax, 12(%rdi)
+; AVX512-NEXT: movw %r11w, 10(%rdi)
+; AVX512-NEXT: movw %r10w, 6(%rdi)
+; AVX512-NEXT: movw %r9w, 4(%rdi)
+; AVX512-NEXT: movw %r8w, 2(%rdi)
+; AVX512-NEXT: retq
+ %1 = fptrunc <8 x float> %a0 to <8 x half>
+ %2 = bitcast <8 x half> %1 to <8 x i16>
+ store <8 x i16> %2, <8 x i16>* %a1
+ ret void
+}
+
+define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) {
+; AVX1-LABEL: store_cvt_16f32_to_16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm4
+; AVX1-NEXT: vmovd %xmm4, %eax
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm4
+; AVX1-NEXT: movw %ax, 24(%rdi)
+; AVX1-NEXT: vmovd %xmm4, %eax
+; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm4
+; AVX1-NEXT: movw %ax, 16(%rdi)
+; AVX1-NEXT: vmovd %xmm4, %eax
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm4
+; AVX1-NEXT: movw %ax, 8(%rdi)
+; AVX1-NEXT: vmovd %xmm4, %eax
+; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX1-NEXT: movw %ax, (%rdi)
+; AVX1-NEXT: vmovd %xmm4, %eax
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX1-NEXT: movw %ax, 30(%rdi)
+; AVX1-NEXT: vmovd %xmm4, %eax
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX1-NEXT: movw %ax, 28(%rdi)
+; AVX1-NEXT: vmovd %xmm3, %eax
+; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX1-NEXT: movw %ax, 26(%rdi)
+; AVX1-NEXT: vmovd %xmm3, %eax
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX1-NEXT: movw %ax, 22(%rdi)
+; AVX1-NEXT: vmovd %xmm3, %eax
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: movw %ax, 20(%rdi)
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: movw %ax, 18(%rdi)
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX1-NEXT: movw %ax, 14(%rdi)
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: movw %ax, 12(%rdi)
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: movw %ax, 10(%rdi)
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: movw %ax, 6(%rdi)
+; AVX1-NEXT: vmovd %xmm3, %eax
+; AVX1-NEXT: movw %ax, 4(%rdi)
+; AVX1-NEXT: vmovd %xmm4, %eax
+; AVX1-NEXT: movw %ax, 2(%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: store_cvt_16f32_to_16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm4
+; AVX2-NEXT: vmovd %xmm4, %eax
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm4
+; AVX2-NEXT: movw %ax, 24(%rdi)
+; AVX2-NEXT: vmovd %xmm4, %eax
+; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm4
+; AVX2-NEXT: movw %ax, 16(%rdi)
+; AVX2-NEXT: vmovd %xmm4, %eax
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm4
+; AVX2-NEXT: movw %ax, 8(%rdi)
+; AVX2-NEXT: vmovd %xmm4, %eax
+; AVX2-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX2-NEXT: movw %ax, (%rdi)
+; AVX2-NEXT: vmovd %xmm4, %eax
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX2-NEXT: movw %ax, 30(%rdi)
+; AVX2-NEXT: vmovd %xmm4, %eax
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX2-NEXT: movw %ax, 28(%rdi)
+; AVX2-NEXT: vmovd %xmm3, %eax
+; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX2-NEXT: movw %ax, 26(%rdi)
+; AVX2-NEXT: vmovd %xmm3, %eax
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX2-NEXT: movw %ax, 22(%rdi)
+; AVX2-NEXT: vmovd %xmm3, %eax
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: movw %ax, 20(%rdi)
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: movw %ax, 18(%rdi)
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX2-NEXT: movw %ax, 14(%rdi)
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: movw %ax, 12(%rdi)
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: movw %ax, 10(%rdi)
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: movw %ax, 6(%rdi)
+; AVX2-NEXT: vmovd %xmm3, %eax
+; AVX2-NEXT: movw %ax, 4(%rdi)
+; AVX2-NEXT: vmovd %xmm4, %eax
+; AVX2-NEXT: movw %ax, 2(%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: store_cvt_16f32_to_16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm4
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm4
+; AVX512-NEXT: movw %ax, 24(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm4
+; AVX512-NEXT: movw %ax, 16(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm4
+; AVX512-NEXT: movw %ax, 8(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: movw %ax, (%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: movw %ax, 30(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: movw %ax, 28(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: movw %ax, 26(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: movw %ax, 22(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: movw %ax, 20(%rdi)
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: movw %ax, 18(%rdi)
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: movw %ax, 14(%rdi)
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: movw %ax, 12(%rdi)
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: movw %ax, 10(%rdi)
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: movw %ax, 6(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: movw %ax, 4(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: movw %ax, 2(%rdi)
+; AVX512-NEXT: retq
+ %1 = fptrunc <16 x float> %a0 to <16 x half>
+ %2 = bitcast <16 x half> %1 to <16 x i16>
+ store <16 x i16> %2, <16 x i16>* %a1
+ ret void
+}
+
+;
+; Double to Half
+;
+
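+; These subtargets have no direct double-to-half conversion instruction, so
+; fptrunc from double is lowered to calls to the __truncdfhf2 runtime routine
+; (a tail call in the scalar case; the wider cases spill the vector and call
+; it once per element). Minimal sketch of the pattern (illustrative names):
+;
+;   %h = fptrunc double %d to half
+;   %w = bitcast half %h to i16
+;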
+define i16 @cvt_f64_to_i16(double %a0) {
+; ALL-LABEL: cvt_f64_to_i16:
+; ALL: # BB#0:
+; ALL-NEXT: jmp __truncdfhf2 # TAILCALL
+ %1 = fptrunc double %a0 to half
+ %2 = bitcast half %1 to i16
+ ret i16 %2
+}
+
+define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) {
+; ALL-LABEL: cvt_2f64_to_2i16:
+; ALL: # BB#0:
+; ALL-NEXT: pushq %rbx
+; ALL-NEXT: .Ltmp0:
+; ALL-NEXT: .cfi_def_cfa_offset 16
+; ALL-NEXT: subq $16, %rsp
+; ALL-NEXT: .Ltmp1:
+; ALL-NEXT: .cfi_def_cfa_offset 32
+; ALL-NEXT: .Ltmp2:
+; ALL-NEXT: .cfi_offset %rbx, -16
+; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: callq __truncdfhf2
+; ALL-NEXT: movw %ax, %bx
+; ALL-NEXT: shll $16, %ebx
+; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; ALL-NEXT: callq __truncdfhf2
+; ALL-NEXT: movzwl %ax, %eax
+; ALL-NEXT: orl %ebx, %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: addq $16, %rsp
+; ALL-NEXT: popq %rbx
+; ALL-NEXT: retq
+ %1 = fptrunc <2 x double> %a0 to <2 x half>
+ %2 = bitcast <2 x half> %1 to <2 x i16>
+ ret <2 x i16> %2
+}
+
+define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) {
+; AVX1-LABEL: cvt_4f64_to_4i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: .Ltmp3:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: .Ltmp4:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: subq $40, %rsp
+; AVX1-NEXT: .Ltmp5:
+; AVX1-NEXT: .cfi_def_cfa_offset 64
+; AVX1-NEXT: .Ltmp6:
+; AVX1-NEXT: .cfi_offset %rbx, -24
+; AVX1-NEXT: .Ltmp7:
+; AVX1-NEXT: .cfi_offset %r14, -16
+; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %r14d
+; AVX1-NEXT: orl %ebx, %r14d
+; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: orl %ebx, %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %r14, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: addq $40, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cvt_4f64_to_4i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: .Ltmp3:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: .Ltmp4:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: subq $40, %rsp
+; AVX2-NEXT: .Ltmp5:
+; AVX2-NEXT: .cfi_def_cfa_offset 64
+; AVX2-NEXT: .Ltmp6:
+; AVX2-NEXT: .cfi_offset %rbx, -24
+; AVX2-NEXT: .Ltmp7:
+; AVX2-NEXT: .cfi_offset %r14, -16
+; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %r14d
+; AVX2-NEXT: orl %ebx, %r14d
+; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: orl %ebx, %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %r14, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: addq $40, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: cvt_4f64_to_4i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: .Ltmp3:
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: .Ltmp4:
+; AVX512-NEXT: .cfi_def_cfa_offset 24
+; AVX512-NEXT: subq $40, %rsp
+; AVX512-NEXT: .Ltmp5:
+; AVX512-NEXT: .cfi_def_cfa_offset 64
+; AVX512-NEXT: .Ltmp6:
+; AVX512-NEXT: .cfi_offset %rbx, -24
+; AVX512-NEXT: .Ltmp7:
+; AVX512-NEXT: .cfi_offset %r14, -16
+; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %r14d
+; AVX512-NEXT: orl %ebx, %r14d
+; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %ebx, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %r14, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: addq $40, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: retq
+ %1 = fptrunc <4 x double> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ ret <4 x i16> %2
+}
+
+define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) {
+; AVX1-LABEL: cvt_4f64_to_8i16_undef:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: .Ltmp8:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: .Ltmp9:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: subq $40, %rsp
+; AVX1-NEXT: .Ltmp10:
+; AVX1-NEXT: .cfi_def_cfa_offset 64
+; AVX1-NEXT: .Ltmp11:
+; AVX1-NEXT: .cfi_offset %rbx, -24
+; AVX1-NEXT: .Ltmp12:
+; AVX1-NEXT: .cfi_offset %r14, -16
+; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %r14d
+; AVX1-NEXT: orl %ebx, %r14d
+; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: orl %ebx, %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %r14, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: addq $40, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cvt_4f64_to_8i16_undef:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: .Ltmp8:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: .Ltmp9:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: subq $40, %rsp
+; AVX2-NEXT: .Ltmp10:
+; AVX2-NEXT: .cfi_def_cfa_offset 64
+; AVX2-NEXT: .Ltmp11:
+; AVX2-NEXT: .cfi_offset %rbx, -24
+; AVX2-NEXT: .Ltmp12:
+; AVX2-NEXT: .cfi_offset %r14, -16
+; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %r14d
+; AVX2-NEXT: orl %ebx, %r14d
+; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: orl %ebx, %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %r14, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: addq $40, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: cvt_4f64_to_8i16_undef:
+; AVX512: # BB#0:
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: .Ltmp8:
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: .Ltmp9:
+; AVX512-NEXT: .cfi_def_cfa_offset 24
+; AVX512-NEXT: subq $40, %rsp
+; AVX512-NEXT: .Ltmp10:
+; AVX512-NEXT: .cfi_def_cfa_offset 64
+; AVX512-NEXT: .Ltmp11:
+; AVX512-NEXT: .cfi_offset %rbx, -24
+; AVX512-NEXT: .Ltmp12:
+; AVX512-NEXT: .cfi_offset %r14, -16
+; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %r14d
+; AVX512-NEXT: orl %ebx, %r14d
+; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %ebx, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %r14, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT: addq $40, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: retq
+ %1 = fptrunc <4 x double> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) {
+; AVX1-LABEL: cvt_4f64_to_8i16_zero:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: .Ltmp13:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: .Ltmp14:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: subq $40, %rsp
+; AVX1-NEXT: .Ltmp15:
+; AVX1-NEXT: .cfi_def_cfa_offset 64
+; AVX1-NEXT: .Ltmp16:
+; AVX1-NEXT: .cfi_offset %rbx, -24
+; AVX1-NEXT: .Ltmp17:
+; AVX1-NEXT: .cfi_offset %r14, -16
+; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %r14d
+; AVX1-NEXT: orl %ebx, %r14d
+; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: orl %ebx, %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %r14, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: addq $40, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cvt_4f64_to_8i16_zero:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: .Ltmp13:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: .Ltmp14:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: subq $40, %rsp
+; AVX2-NEXT: .Ltmp15:
+; AVX2-NEXT: .cfi_def_cfa_offset 64
+; AVX2-NEXT: .Ltmp16:
+; AVX2-NEXT: .cfi_offset %rbx, -24
+; AVX2-NEXT: .Ltmp17:
+; AVX2-NEXT: .cfi_offset %r14, -16
+; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %r14d
+; AVX2-NEXT: orl %ebx, %r14d
+; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: orl %ebx, %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %r14, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: addq $40, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: cvt_4f64_to_8i16_zero:
+; AVX512: # BB#0:
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: .Ltmp13:
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: .Ltmp14:
+; AVX512-NEXT: .cfi_def_cfa_offset 24
+; AVX512-NEXT: subq $40, %rsp
+; AVX512-NEXT: .Ltmp15:
+; AVX512-NEXT: .cfi_def_cfa_offset 64
+; AVX512-NEXT: .Ltmp16:
+; AVX512-NEXT: .cfi_offset %rbx, -24
+; AVX512-NEXT: .Ltmp17:
+; AVX512-NEXT: .cfi_offset %r14, -16
+; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %r14d
+; AVX512-NEXT: orl %ebx, %r14d
+; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %ebx, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %r14, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: addq $40, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: retq
+ %1 = fptrunc <4 x double> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) {
+; AVX1-LABEL: cvt_8f64_to_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %r15
+; AVX1-NEXT: .Ltmp18:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: .Ltmp19:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: .Ltmp20:
+; AVX1-NEXT: .cfi_def_cfa_offset 32
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: .Ltmp21:
+; AVX1-NEXT: .cfi_def_cfa_offset 96
+; AVX1-NEXT: .Ltmp22:
+; AVX1-NEXT: .cfi_offset %rbx, -32
+; AVX1-NEXT: .Ltmp23:
+; AVX1-NEXT: .cfi_offset %r14, -24
+; AVX1-NEXT: .Ltmp24:
+; AVX1-NEXT: .cfi_offset %r15, -16
+; AVX1-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
+; AVX1-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %r15d
+; AVX1-NEXT: orl %ebx, %r15d
+; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %r14d
+; AVX1-NEXT: orl %ebx, %r14d
+; AVX1-NEXT: shlq $32, %r14
+; AVX1-NEXT: orq %r15, %r14
+; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %r15d
+; AVX1-NEXT: orl %ebx, %r15d
+; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: shll $16, %ebx
+; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: orl %ebx, %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %r15, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vmovq %r14, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: addq $64, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cvt_8f64_to_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: .Ltmp18:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: .Ltmp19:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: .Ltmp20:
+; AVX2-NEXT: .cfi_def_cfa_offset 32
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: .Ltmp21:
+; AVX2-NEXT: .cfi_def_cfa_offset 96
+; AVX2-NEXT: .Ltmp22:
+; AVX2-NEXT: .cfi_offset %rbx, -32
+; AVX2-NEXT: .Ltmp23:
+; AVX2-NEXT: .cfi_offset %r14, -24
+; AVX2-NEXT: .Ltmp24:
+; AVX2-NEXT: .cfi_offset %r15, -16
+; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %r15d
+; AVX2-NEXT: orl %ebx, %r15d
+; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %r14d
+; AVX2-NEXT: orl %ebx, %r14d
+; AVX2-NEXT: shlq $32, %r14
+; AVX2-NEXT: orq %r15, %r14
+; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %r15d
+; AVX2-NEXT: orl %ebx, %r15d
+; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: shll $16, %ebx
+; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: orl %ebx, %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %r15, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vmovq %r14, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: addq $64, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: cvt_8f64_to_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: .Ltmp18:
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: .Ltmp19:
+; AVX512-NEXT: .cfi_def_cfa_offset 24
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: .Ltmp20:
+; AVX512-NEXT: .cfi_def_cfa_offset 32
+; AVX512-NEXT: subq $96, %rsp
+; AVX512-NEXT: .Ltmp21:
+; AVX512-NEXT: .cfi_def_cfa_offset 128
+; AVX512-NEXT: .Ltmp22:
+; AVX512-NEXT: .cfi_offset %rbx, -32
+; AVX512-NEXT: .Ltmp23:
+; AVX512-NEXT: .cfi_offset %r14, -24
+; AVX512-NEXT: .Ltmp24:
+; AVX512-NEXT: .cfi_offset %r15, -16
+; AVX512-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %r15d
+; AVX512-NEXT: orl %ebx, %r15d
+; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %r14d
+; AVX512-NEXT: orl %ebx, %r14d
+; AVX512-NEXT: shlq $32, %r14
+; AVX512-NEXT: orq %r15, %r14
+; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %r15d
+; AVX512-NEXT: orl %ebx, %r15d
+; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %ebx, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %r15, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vmovq %r14, %xmm1
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: addq $96, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: retq
+ %1 = fptrunc <8 x double> %a0 to <8 x half>
+ %2 = bitcast <8 x half> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+;
+; Double to Half (Store)
+;
+
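+; As above, the double-to-half store tests go through __truncdfhf2; the
+; destination pointer is preserved in a callee-saved register across the
+; calls and each result is written back with a word store (illustrative
+; names):
+;
+;   %h = fptrunc double %d to half
+;   %w = bitcast half %h to i16
+;   store i16 %w, i16* %p
+;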
+define void @store_cvt_f64_to_i16(double %a0, i16* %a1) {
+; ALL-LABEL: store_cvt_f64_to_i16:
+; ALL: # BB#0:
+; ALL-NEXT: pushq %rbx
+; ALL-NEXT: .Ltmp25:
+; ALL-NEXT: .cfi_def_cfa_offset 16
+; ALL-NEXT: .Ltmp26:
+; ALL-NEXT: .cfi_offset %rbx, -16
+; ALL-NEXT: movq %rdi, %rbx
+; ALL-NEXT: callq __truncdfhf2
+; ALL-NEXT: movw %ax, (%rbx)
+; ALL-NEXT: popq %rbx
+; ALL-NEXT: retq
+ %1 = fptrunc double %a0 to half
+ %2 = bitcast half %1 to i16
+ store i16 %2, i16* %a1
+ ret void
+}
+
+define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) {
+; ALL-LABEL: store_cvt_2f64_to_2i16:
+; ALL: # BB#0:
+; ALL-NEXT: pushq %rbp
+; ALL-NEXT: .Ltmp27:
+; ALL-NEXT: .cfi_def_cfa_offset 16
+; ALL-NEXT: pushq %rbx
+; ALL-NEXT: .Ltmp28:
+; ALL-NEXT: .cfi_def_cfa_offset 24
+; ALL-NEXT: subq $24, %rsp
+; ALL-NEXT: .Ltmp29:
+; ALL-NEXT: .cfi_def_cfa_offset 48
+; ALL-NEXT: .Ltmp30:
+; ALL-NEXT: .cfi_offset %rbx, -24
+; ALL-NEXT: .Ltmp31:
+; ALL-NEXT: .cfi_offset %rbp, -16
+; ALL-NEXT: movq %rdi, %rbx
+; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: callq __truncdfhf2
+; ALL-NEXT: movl %eax, %ebp
+; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; ALL-NEXT: callq __truncdfhf2
+; ALL-NEXT: movw %ax, (%rbx)
+; ALL-NEXT: movw %bp, 2(%rbx)
+; ALL-NEXT: addq $24, %rsp
+; ALL-NEXT: popq %rbx
+; ALL-NEXT: popq %rbp
+; ALL-NEXT: retq
+ %1 = fptrunc <2 x double> %a0 to <2 x half>
+ %2 = bitcast <2 x half> %1 to <2 x i16>
+ store <2 x i16> %2, <2 x i16>* %a1
+ ret void
+}
+
+define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) {
+; AVX1-LABEL: store_cvt_4f64_to_4i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Ltmp32:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %r15
+; AVX1-NEXT: .Ltmp33:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: .Ltmp34:
+; AVX1-NEXT: .cfi_def_cfa_offset 32
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: .Ltmp35:
+; AVX1-NEXT: .cfi_def_cfa_offset 40
+; AVX1-NEXT: subq $88, %rsp
+; AVX1-NEXT: .Ltmp36:
+; AVX1-NEXT: .cfi_def_cfa_offset 128
+; AVX1-NEXT: .Ltmp37:
+; AVX1-NEXT: .cfi_offset %rbx, -40
+; AVX1-NEXT: .Ltmp38:
+; AVX1-NEXT: .cfi_offset %r14, -32
+; AVX1-NEXT: .Ltmp39:
+; AVX1-NEXT: .cfi_offset %r15, -24
+; AVX1-NEXT: .Ltmp40:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rdi, %rbx
+; AVX1-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movl %eax, %r14d
+; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movl %eax, %r15d
+; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movl %eax, %ebp
+; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, 4(%rbx)
+; AVX1-NEXT: movw %bp, (%rbx)
+; AVX1-NEXT: movw %r15w, 6(%rbx)
+; AVX1-NEXT: movw %r14w, 2(%rbx)
+; AVX1-NEXT: addq $88, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: store_cvt_4f64_to_4i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Ltmp32:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: .Ltmp33:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: .Ltmp34:
+; AVX2-NEXT: .cfi_def_cfa_offset 32
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: .Ltmp35:
+; AVX2-NEXT: .cfi_def_cfa_offset 40
+; AVX2-NEXT: subq $88, %rsp
+; AVX2-NEXT: .Ltmp36:
+; AVX2-NEXT: .cfi_def_cfa_offset 128
+; AVX2-NEXT: .Ltmp37:
+; AVX2-NEXT: .cfi_offset %rbx, -40
+; AVX2-NEXT: .Ltmp38:
+; AVX2-NEXT: .cfi_offset %r14, -32
+; AVX2-NEXT: .Ltmp39:
+; AVX2-NEXT: .cfi_offset %r15, -24
+; AVX2-NEXT: .Ltmp40:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rdi, %rbx
+; AVX2-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movl %eax, %r14d
+; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movl %eax, %r15d
+; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movl %eax, %ebp
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, 4(%rbx)
+; AVX2-NEXT: movw %bp, (%rbx)
+; AVX2-NEXT: movw %r15w, 6(%rbx)
+; AVX2-NEXT: movw %r14w, 2(%rbx)
+; AVX2-NEXT: addq $88, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: store_cvt_4f64_to_4i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: .Ltmp32:
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: .Ltmp33:
+; AVX512-NEXT: .cfi_def_cfa_offset 24
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: .Ltmp34:
+; AVX512-NEXT: .cfi_def_cfa_offset 32
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: .Ltmp35:
+; AVX512-NEXT: .cfi_def_cfa_offset 40
+; AVX512-NEXT: subq $88, %rsp
+; AVX512-NEXT: .Ltmp36:
+; AVX512-NEXT: .cfi_def_cfa_offset 128
+; AVX512-NEXT: .Ltmp37:
+; AVX512-NEXT: .cfi_offset %rbx, -40
+; AVX512-NEXT: .Ltmp38:
+; AVX512-NEXT: .cfi_offset %r14, -32
+; AVX512-NEXT: .Ltmp39:
+; AVX512-NEXT: .cfi_offset %r15, -24
+; AVX512-NEXT: .Ltmp40:
+; AVX512-NEXT: .cfi_offset %rbp, -16
+; AVX512-NEXT: movq %rdi, %rbx
+; AVX512-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %r14d
+; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %r15d
+; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebp
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, 4(%rbx)
+; AVX512-NEXT: movw %bp, (%rbx)
+; AVX512-NEXT: movw %r15w, 6(%rbx)
+; AVX512-NEXT: movw %r14w, 2(%rbx)
+; AVX512-NEXT: addq $88, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
+ %1 = fptrunc <4 x double> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ store <4 x i16> %2, <4 x i16>* %a1
+ ret void
+}
+
+define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) {
+; AVX1-LABEL: store_cvt_4f64_to_8i16_undef:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Ltmp41:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: .Ltmp42:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: .Ltmp43:
+; AVX1-NEXT: .cfi_def_cfa_offset 32
+; AVX1-NEXT: subq $32, %rsp
+; AVX1-NEXT: .Ltmp44:
+; AVX1-NEXT: .cfi_def_cfa_offset 64
+; AVX1-NEXT: .Ltmp45:
+; AVX1-NEXT: .cfi_offset %rbx, -32
+; AVX1-NEXT: .Ltmp46:
+; AVX1-NEXT: .cfi_offset %r14, -24
+; AVX1-NEXT: .Ltmp47:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rdi, %r14
+; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: shll $16, %ebp
+; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %ebx
+; AVX1-NEXT: orl %ebp, %ebx
+; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: shll $16, %ebp
+; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: orl %ebp, %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %rbx, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vmovdqa %xmm0, (%r14)
+; AVX1-NEXT: addq $32, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: store_cvt_4f64_to_8i16_undef:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Ltmp41:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: .Ltmp42:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: .Ltmp43:
+; AVX2-NEXT: .cfi_def_cfa_offset 32
+; AVX2-NEXT: subq $32, %rsp
+; AVX2-NEXT: .Ltmp44:
+; AVX2-NEXT: .cfi_def_cfa_offset 64
+; AVX2-NEXT: .Ltmp45:
+; AVX2-NEXT: .cfi_offset %rbx, -32
+; AVX2-NEXT: .Ltmp46:
+; AVX2-NEXT: .cfi_offset %r14, -24
+; AVX2-NEXT: .Ltmp47:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rdi, %r14
+; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: shll $16, %ebp
+; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %ebx
+; AVX2-NEXT: orl %ebp, %ebx
+; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: shll $16, %ebp
+; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: orl %ebp, %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %rbx, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: vmovdqa %xmm0, (%r14)
+; AVX2-NEXT: addq $32, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: store_cvt_4f64_to_8i16_undef:
+; AVX512: # BB#0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: .Ltmp41:
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: .Ltmp42:
+; AVX512-NEXT: .cfi_def_cfa_offset 24
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: .Ltmp43:
+; AVX512-NEXT: .cfi_def_cfa_offset 32
+; AVX512-NEXT: subq $32, %rsp
+; AVX512-NEXT: .Ltmp44:
+; AVX512-NEXT: .cfi_def_cfa_offset 64
+; AVX512-NEXT: .Ltmp45:
+; AVX512-NEXT: .cfi_offset %rbx, -32
+; AVX512-NEXT: .Ltmp46:
+; AVX512-NEXT: .cfi_offset %r14, -24
+; AVX512-NEXT: .Ltmp47:
+; AVX512-NEXT: .cfi_offset %rbp, -16
+; AVX512-NEXT: movq %rdi, %r14
+; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bp
+; AVX512-NEXT: shll $16, %ebp
+; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %ebx
+; AVX512-NEXT: orl %ebp, %ebx
+; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bp
+; AVX512-NEXT: shll $16, %ebp
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %ebp, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %rbx, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT: vmovdqa %xmm0, (%r14)
+; AVX512-NEXT: addq $32, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
+ %1 = fptrunc <4 x double> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <8 x i16> %3, <8 x i16>* %a1
+ ret void
+}
+
+define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) {
+; AVX1-LABEL: store_cvt_4f64_to_8i16_zero:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Ltmp48:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: .Ltmp49:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: .Ltmp50:
+; AVX1-NEXT: .cfi_def_cfa_offset 32
+; AVX1-NEXT: subq $32, %rsp
+; AVX1-NEXT: .Ltmp51:
+; AVX1-NEXT: .cfi_def_cfa_offset 64
+; AVX1-NEXT: .Ltmp52:
+; AVX1-NEXT: .cfi_offset %rbx, -32
+; AVX1-NEXT: .Ltmp53:
+; AVX1-NEXT: .cfi_offset %r14, -24
+; AVX1-NEXT: .Ltmp54:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rdi, %r14
+; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: shll $16, %ebp
+; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %ebx
+; AVX1-NEXT: orl %ebp, %ebx
+; AVX1-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: shll $16, %ebp
+; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movzwl %ax, %eax
+; AVX1-NEXT: orl %ebp, %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %rbx, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vmovdqa %xmm0, (%r14)
+; AVX1-NEXT: addq $32, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: store_cvt_4f64_to_8i16_zero:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Ltmp48:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: .Ltmp49:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: .Ltmp50:
+; AVX2-NEXT: .cfi_def_cfa_offset 32
+; AVX2-NEXT: subq $32, %rsp
+; AVX2-NEXT: .Ltmp51:
+; AVX2-NEXT: .cfi_def_cfa_offset 64
+; AVX2-NEXT: .Ltmp52:
+; AVX2-NEXT: .cfi_offset %rbx, -32
+; AVX2-NEXT: .Ltmp53:
+; AVX2-NEXT: .cfi_offset %r14, -24
+; AVX2-NEXT: .Ltmp54:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rdi, %r14
+; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: shll $16, %ebp
+; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %ebx
+; AVX2-NEXT: orl %ebp, %ebx
+; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: shll $16, %ebp
+; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movzwl %ax, %eax
+; AVX2-NEXT: orl %ebp, %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %rbx, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vmovdqa %xmm0, (%r14)
+; AVX2-NEXT: addq $32, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: store_cvt_4f64_to_8i16_zero:
+; AVX512: # BB#0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: .Ltmp48:
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: .Ltmp49:
+; AVX512-NEXT: .cfi_def_cfa_offset 24
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: .Ltmp50:
+; AVX512-NEXT: .cfi_def_cfa_offset 32
+; AVX512-NEXT: subq $32, %rsp
+; AVX512-NEXT: .Ltmp51:
+; AVX512-NEXT: .cfi_def_cfa_offset 64
+; AVX512-NEXT: .Ltmp52:
+; AVX512-NEXT: .cfi_offset %rbx, -32
+; AVX512-NEXT: .Ltmp53:
+; AVX512-NEXT: .cfi_offset %r14, -24
+; AVX512-NEXT: .Ltmp54:
+; AVX512-NEXT: .cfi_offset %rbp, -16
+; AVX512-NEXT: movq %rdi, %r14
+; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bp
+; AVX512-NEXT: shll $16, %ebp
+; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %ebx
+; AVX512-NEXT: orl %ebp, %ebx
+; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, %bp
+; AVX512-NEXT: shll $16, %ebp
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %ebp, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %rbx, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vmovdqa %xmm0, (%r14)
+; AVX512-NEXT: addq $32, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
+ %1 = fptrunc <4 x double> %a0 to <4 x half>
+ %2 = bitcast <4 x half> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <8 x i16> %3, <8 x i16>* %a1
+ ret void
+}
+
+define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) {
+; AVX1-LABEL: store_cvt_8f64_to_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: .Ltmp55:
+; AVX1-NEXT: .cfi_def_cfa_offset 16
+; AVX1-NEXT: pushq %r15
+; AVX1-NEXT: .Ltmp56:
+; AVX1-NEXT: .cfi_def_cfa_offset 24
+; AVX1-NEXT: pushq %r14
+; AVX1-NEXT: .Ltmp57:
+; AVX1-NEXT: .cfi_def_cfa_offset 32
+; AVX1-NEXT: pushq %r13
+; AVX1-NEXT: .Ltmp58:
+; AVX1-NEXT: .cfi_def_cfa_offset 40
+; AVX1-NEXT: pushq %r12
+; AVX1-NEXT: .Ltmp59:
+; AVX1-NEXT: .cfi_def_cfa_offset 48
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: .Ltmp60:
+; AVX1-NEXT: .cfi_def_cfa_offset 56
+; AVX1-NEXT: subq $136, %rsp
+; AVX1-NEXT: .Ltmp61:
+; AVX1-NEXT: .cfi_def_cfa_offset 192
+; AVX1-NEXT: .Ltmp62:
+; AVX1-NEXT: .cfi_offset %rbx, -56
+; AVX1-NEXT: .Ltmp63:
+; AVX1-NEXT: .cfi_offset %r12, -48
+; AVX1-NEXT: .Ltmp64:
+; AVX1-NEXT: .cfi_offset %r13, -40
+; AVX1-NEXT: .Ltmp65:
+; AVX1-NEXT: .cfi_offset %r14, -32
+; AVX1-NEXT: .Ltmp66:
+; AVX1-NEXT: .cfi_offset %r15, -24
+; AVX1-NEXT: .Ltmp67:
+; AVX1-NEXT: .cfi_offset %rbp, -16
+; AVX1-NEXT: movq %rdi, %rbx
+; AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX1-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
+; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
+; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movl %eax, %r12d
+; AVX1-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movl %eax, %r13d
+; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movl %eax, %ebp
+; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movl %eax, %r14d
+; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movl %eax, %r15d
+; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: callq __truncdfhf2
+; AVX1-NEXT: movw %ax, 12(%rbx)
+; AVX1-NEXT: movw %r15w, 8(%rbx)
+; AVX1-NEXT: movw %r14w, 4(%rbx)
+; AVX1-NEXT: movw %bp, (%rbx)
+; AVX1-NEXT: movw %r13w, 14(%rbx)
+; AVX1-NEXT: movw %r12w, 10(%rbx)
+; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
+; AVX1-NEXT: movw %ax, 6(%rbx)
+; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
+; AVX1-NEXT: movw %ax, 2(%rbx)
+; AVX1-NEXT: addq $136, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: popq %r12
+; AVX1-NEXT: popq %r13
+; AVX1-NEXT: popq %r14
+; AVX1-NEXT: popq %r15
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: store_cvt_8f64_to_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: .Ltmp55:
+; AVX2-NEXT: .cfi_def_cfa_offset 16
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: .Ltmp56:
+; AVX2-NEXT: .cfi_def_cfa_offset 24
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: .Ltmp57:
+; AVX2-NEXT: .cfi_def_cfa_offset 32
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: .Ltmp58:
+; AVX2-NEXT: .cfi_def_cfa_offset 40
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: .Ltmp59:
+; AVX2-NEXT: .cfi_def_cfa_offset 48
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: .Ltmp60:
+; AVX2-NEXT: .cfi_def_cfa_offset 56
+; AVX2-NEXT: subq $136, %rsp
+; AVX2-NEXT: .Ltmp61:
+; AVX2-NEXT: .cfi_def_cfa_offset 192
+; AVX2-NEXT: .Ltmp62:
+; AVX2-NEXT: .cfi_offset %rbx, -56
+; AVX2-NEXT: .Ltmp63:
+; AVX2-NEXT: .cfi_offset %r12, -48
+; AVX2-NEXT: .Ltmp64:
+; AVX2-NEXT: .cfi_offset %r13, -40
+; AVX2-NEXT: .Ltmp65:
+; AVX2-NEXT: .cfi_offset %r14, -32
+; AVX2-NEXT: .Ltmp66:
+; AVX2-NEXT: .cfi_offset %r15, -24
+; AVX2-NEXT: .Ltmp67:
+; AVX2-NEXT: .cfi_offset %rbp, -16
+; AVX2-NEXT: movq %rdi, %rbx
+; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
+; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
+; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movl %eax, %r12d
+; AVX2-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movl %eax, %r13d
+; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movl %eax, %ebp
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movl %eax, %r14d
+; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movl %eax, %r15d
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: callq __truncdfhf2
+; AVX2-NEXT: movw %ax, 12(%rbx)
+; AVX2-NEXT: movw %r15w, 8(%rbx)
+; AVX2-NEXT: movw %r14w, 4(%rbx)
+; AVX2-NEXT: movw %bp, (%rbx)
+; AVX2-NEXT: movw %r13w, 14(%rbx)
+; AVX2-NEXT: movw %r12w, 10(%rbx)
+; AVX2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
+; AVX2-NEXT: movw %ax, 6(%rbx)
+; AVX2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
+; AVX2-NEXT: movw %ax, 2(%rbx)
+; AVX2-NEXT: addq $136, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: store_cvt_8f64_to_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: .Ltmp55:
+; AVX512-NEXT: .cfi_def_cfa_offset 16
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: .Ltmp56:
+; AVX512-NEXT: .cfi_def_cfa_offset 24
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: .Ltmp57:
+; AVX512-NEXT: .cfi_def_cfa_offset 32
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: .Ltmp58:
+; AVX512-NEXT: .cfi_def_cfa_offset 40
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: .Ltmp59:
+; AVX512-NEXT: .cfi_def_cfa_offset 48
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: .Ltmp60:
+; AVX512-NEXT: .cfi_def_cfa_offset 56
+; AVX512-NEXT: subq $200, %rsp
+; AVX512-NEXT: .Ltmp61:
+; AVX512-NEXT: .cfi_def_cfa_offset 256
+; AVX512-NEXT: .Ltmp62:
+; AVX512-NEXT: .cfi_offset %rbx, -56
+; AVX512-NEXT: .Ltmp63:
+; AVX512-NEXT: .cfi_offset %r12, -48
+; AVX512-NEXT: .Ltmp64:
+; AVX512-NEXT: .cfi_offset %r13, -40
+; AVX512-NEXT: .Ltmp65:
+; AVX512-NEXT: .cfi_offset %r14, -32
+; AVX512-NEXT: .Ltmp66:
+; AVX512-NEXT: .cfi_offset %r15, -24
+; AVX512-NEXT: .Ltmp67:
+; AVX512-NEXT: .cfi_offset %rbp, -16
+; AVX512-NEXT: movq %rdi, %rbx
+; AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
+; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
+; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %r12d
+; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %r13d
+; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebp
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %r14d
+; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %r15d
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, 12(%rbx)
+; AVX512-NEXT: movw %r15w, 8(%rbx)
+; AVX512-NEXT: movw %r14w, 4(%rbx)
+; AVX512-NEXT: movw %bp, (%rbx)
+; AVX512-NEXT: movw %r13w, 14(%rbx)
+; AVX512-NEXT: movw %r12w, 10(%rbx)
+; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
+; AVX512-NEXT: movw %ax, 6(%rbx)
+; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
+; AVX512-NEXT: movw %ax, 2(%rbx)
+; AVX512-NEXT: addq $200, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
+ %1 = fptrunc <8 x double> %a0 to <8 x half>
+ %2 = bitcast <8 x half> %1 to <8 x i16>
+ store <8 x i16> %2, <8 x i16>* %a1
+ ret void
+}
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/test/CodeGen/X86/vector-idiv-sdiv-128.ll
new file mode 100644
index 000000000000..f344d6dc3cc6
--- /dev/null
+++ b/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -0,0 +1,622 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+;
+; sdiv by 7
+;
+
+define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
+; SSE2-LABEL: test_div7_2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
+; SSE2-NEXT: imulq %rcx
+; SSE2-NEXT: movq %rdx, %rax
+; SSE2-NEXT: shrq $63, %rax
+; SSE2-NEXT: sarq %rdx
+; SSE2-NEXT: addq %rax, %rdx
+; SSE2-NEXT: movd %rdx, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rax
+; SSE2-NEXT: imulq %rcx
+; SSE2-NEXT: movq %rdx, %rax
+; SSE2-NEXT: shrq $63, %rax
+; SSE2-NEXT: sarq %rdx
+; SSE2-NEXT: addq %rax, %rdx
+; SSE2-NEXT: movd %rdx, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_div7_2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
+; SSE41-NEXT: imulq %rcx
+; SSE41-NEXT: movq %rdx, %rax
+; SSE41-NEXT: shrq $63, %rax
+; SSE41-NEXT: sarq %rdx
+; SSE41-NEXT: addq %rax, %rdx
+; SSE41-NEXT: movd %rdx, %xmm1
+; SSE41-NEXT: movd %xmm0, %rax
+; SSE41-NEXT: imulq %rcx
+; SSE41-NEXT: movq %rdx, %rax
+; SSE41-NEXT: shrq $63, %rax
+; SSE41-NEXT: sarq %rdx
+; SSE41-NEXT: addq %rax, %rdx
+; SSE41-NEXT: movd %rdx, %xmm0
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_div7_2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm1
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %res = sdiv <2 x i64> %a, <i64 7, i64 7>
+ ret <2 x i64> %res
+}
+
+define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
+; SSE2-LABEL: test_div7_4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: psubd %xmm2, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrld $31, %xmm0
+; SSE2-NEXT: psrad $2, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_div7_4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: pmuldq %xmm2, %xmm3
+; SSE41-NEXT: pmuldq %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: paddd %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrld $31, %xmm0
+; SSE41-NEXT: psrad $2, %xmm1
+; SSE41-NEXT: paddd %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_div7_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrld $31, %xmm0, %xmm1
+; AVX2-NEXT: vpsrad $2, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %res = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+ ret <4 x i32> %res
+}
+
+define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
+; SSE-LABEL: test_div7_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrlw $15, %xmm1
+; SSE-NEXT: psraw $1, %xmm0
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_div7_8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1
+; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %res = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <8 x i16> %res
+}
+
+define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
+; SSE2-LABEL: test_div7_16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm2, %xmm3
+; SSE2-NEXT: psrlw $8, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmullw %xmm2, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm2, %xmm0
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_div7_16i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
+; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm2
+; SSE41-NEXT: pmullw %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm3, %xmm3
+; SSE41-NEXT: pmullw %xmm2, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: psrlw $2, %xmm0
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: psubb %xmm2, %xmm0
+; SSE41-NEXT: psrlw $7, %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_div7_16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
+; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm2
+; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3
+; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
+; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %res = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <16 x i8> %res
+}
+
+;
+; srem by 7
+;
+
+define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
+; SSE2-LABEL: test_rem7_2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: imulq %rsi
+; SSE2-NEXT: movq %rdx, %rax
+; SSE2-NEXT: shrq $63, %rax
+; SSE2-NEXT: sarq %rdx
+; SSE2-NEXT: addq %rax, %rdx
+; SSE2-NEXT: leaq (,%rdx,8), %rax
+; SSE2-NEXT: subq %rdx, %rax
+; SSE2-NEXT: subq %rax, %rcx
+; SSE2-NEXT: movd %rcx, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: imulq %rsi
+; SSE2-NEXT: movq %rdx, %rax
+; SSE2-NEXT: shrq $63, %rax
+; SSE2-NEXT: sarq %rdx
+; SSE2-NEXT: addq %rax, %rdx
+; SSE2-NEXT: leaq (,%rdx,8), %rax
+; SSE2-NEXT: subq %rdx, %rax
+; SSE2-NEXT: subq %rax, %rcx
+; SSE2-NEXT: movd %rcx, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_rem7_2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rcx
+; SSE41-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: imulq %rsi
+; SSE41-NEXT: movq %rdx, %rax
+; SSE41-NEXT: shrq $63, %rax
+; SSE41-NEXT: sarq %rdx
+; SSE41-NEXT: addq %rax, %rdx
+; SSE41-NEXT: leaq (,%rdx,8), %rax
+; SSE41-NEXT: subq %rdx, %rax
+; SSE41-NEXT: subq %rax, %rcx
+; SSE41-NEXT: movd %rcx, %xmm1
+; SSE41-NEXT: movd %xmm0, %rcx
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: imulq %rsi
+; SSE41-NEXT: movq %rdx, %rax
+; SSE41-NEXT: shrq $63, %rax
+; SSE41-NEXT: sarq %rdx
+; SSE41-NEXT: addq %rax, %rdx
+; SSE41-NEXT: leaq (,%rdx,8), %rax
+; SSE41-NEXT: subq %rdx, %rax
+; SSE41-NEXT: subq %rax, %rcx
+; SSE41-NEXT: movd %rcx, %xmm0
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_rem7_2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: vmovq %xmm0, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %res = srem <2 x i64> %a, <i64 7, i64 7>
+ ret <2 x i64> %res
+}
+
+define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
+; SSE2-LABEL: test_rem7_4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: psubd %xmm2, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrld $31, %xmm2
+; SSE2-NEXT: psrad $2, %xmm1
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [7,7,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_rem7_4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: pmuldq %xmm2, %xmm3
+; SSE41-NEXT: pmuldq %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: paddd %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrld $31, %xmm2
+; SSE41-NEXT: psrad $2, %xmm1
+; SSE41-NEXT: paddd %xmm2, %xmm1
+; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_rem7_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld $31, %xmm1, %xmm2
+; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vpsrld $31, %xmm1, %xmm2
+; AVX2-NEXT: vpsrad $2, %xmm1, %xmm1
+; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %res = srem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+ ret <4 x i32> %res
+}
+
+define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
+; SSE-LABEL: test_rem7_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [18725,18725,18725,18725,18725,18725,18725,18725]
+; SSE-NEXT: pmulhw %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psrlw $15, %xmm2
+; SSE-NEXT: psraw $1, %xmm1
+; SSE-NEXT: paddw %xmm2, %xmm1
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1
+; SSE-NEXT: psubw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_rem7_8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2
+; AVX-NEXT: vpsraw $1, %xmm1, %xmm1
+; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %res = srem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <8 x i16> %res
+}
+
+define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
+; SSE2-LABEL: test_rem7_16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm2, %xmm3
+; SSE2-NEXT: psrlw $8, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmullw %xmm2, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: paddb %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlw $2, %xmm2
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE2-NEXT: pxor %xmm3, %xmm2
+; SSE2-NEXT: psubb %xmm3, %xmm2
+; SSE2-NEXT: psrlw $7, %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: paddb %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm3, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmullw %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_rem7_16i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
+; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm2
+; SSE41-NEXT: pmullw %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm3, %xmm3
+; SSE41-NEXT: pmullw %xmm2, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: paddb %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrlw $2, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE41-NEXT: pxor %xmm3, %xmm2
+; SSE41-NEXT: psubb %xmm3, %xmm2
+; SSE41-NEXT: psrlw $7, %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE41-NEXT: paddb %xmm2, %xmm1
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
+; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm3
+; SSE41-NEXT: pmullw %xmm3, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm1
+; SSE41-NEXT: pmullw %xmm3, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm2
+; SSE41-NEXT: psubb %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_rem7_16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
+; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm2
+; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3
+; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpmovsxbw %xmm1, %xmm2
+; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm3
+; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
+; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %res = srem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <16 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/test/CodeGen/X86/vector-idiv-sdiv-256.ll
new file mode 100644
index 000000000000..cfd2fc625a6c
--- /dev/null
+++ b/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -0,0 +1,545 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+;
+; sdiv by 7
+;
+
+define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
+; AVX1-LABEL: test_div7_4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrq $1, %xmm1, %rax
+; AVX1-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
+; AVX1-NEXT: imulq %rcx
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shrq $63, %rax
+; AVX1-NEXT: sarq %rdx
+; AVX1-NEXT: addq %rax, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm2
+; AVX1-NEXT: vmovq %xmm1, %rax
+; AVX1-NEXT: imulq %rcx
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shrq $63, %rax
+; AVX1-NEXT: sarq %rdx
+; AVX1-NEXT: addq %rax, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: imulq %rcx
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shrq $63, %rax
+; AVX1-NEXT: sarq %rdx
+; AVX1-NEXT: addq %rax, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: imulq %rcx
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shrq $63, %rax
+; AVX1-NEXT: sarq %rdx
+; AVX1-NEXT: addq %rax, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
+; AVX2-NEXT: imulq %rcx
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shrq $63, %rax
+; AVX2-NEXT: sarq %rdx
+; AVX2-NEXT: addq %rax, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm2
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: imulq %rcx
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shrq $63, %rax
+; AVX2-NEXT: sarq %rdx
+; AVX2-NEXT: addq %rax, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: imulq %rcx
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shrq $63, %rax
+; AVX2-NEXT: sarq %rdx
+; AVX2-NEXT: addq %rax, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm2
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: imulq %rcx
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shrq $63, %rax
+; AVX2-NEXT: sarq %rdx
+; AVX2-NEXT: addq %rax, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = sdiv <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
+ ret <4 x i64> %res
+}
+
+define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
+; AVX1-LABEL: test_div7_8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpmuldq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $31, %xmm2, %xmm3
+; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
+; AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrld $31, %ymm0, %ymm1
+; AVX2-NEXT: vpsrad $2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <8 x i32> %res
+}
+
+define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
+; AVX1-LABEL: test_div7_16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
+; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm3
+; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpmulhw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm2
+; AVX1-NEXT: vpsraw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm1
+; AVX2-NEXT: vpsraw $1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = sdiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <16 x i16> %res
+}
+
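+; The byte case below has no vector multiply-high, so the expected sequence
+; sign-extends the bytes to 16 bits, multiplies by the byte magic -109 (the
+; splat constant 147), keeps the high byte of each product, and adds the
+; dividend back because the magic is negative. The arithmetic shift right by
+; 2 is then emulated with a logical shift, an AND with 63 and the xor/sub-32
+; sign-extension trick, and the per-byte sign bit is added last. A rough
+; worked example with x = 100: (100 * -109) >> 8 = -43, -43 + 100 = 57,
+; (57 >> 2) + sign_bit(57) = 14 + 0 = 14 = 100 sdiv 7.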
+define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
+; AVX1-LABEL: test_div7_32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpmovsxbw %xmm1, %xmm2
+; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm3
+; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
+; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2
+; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm7, %xmm7
+; AVX1-NEXT: vpmullw %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpmovsxbw %xmm3, %ymm3
+; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm3
+; AVX2-NEXT: vpmullw %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %res = sdiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ ret <32 x i8> %res
+}
+
+;
+; srem by 7
+;
+
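+; The remainder checks below reuse the sdiv-by-7 sequences and then compute
+; x - 7*q. The scalarized 64-bit lanes form 7*q without a multiply, as
+; (q << 3) - q (the leaq (,%rdx,8) / subq pair); the narrower vector paths
+; multiply q by a splat of 7 instead. A small worked example with x = 100:
+; q = 14, 7*q = (14 << 3) - 14 = 98, and 100 - 98 = 2 = 100 srem 7.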
+define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
+; AVX1-LABEL: test_rem7_4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX1-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: imulq %rsi
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shrq $63, %rax
+; AVX1-NEXT: sarq %rdx
+; AVX1-NEXT: addq %rax, %rdx
+; AVX1-NEXT: leaq (,%rdx,8), %rax
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: subq %rax, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: vmovq %xmm1, %rcx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: imulq %rsi
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shrq $63, %rax
+; AVX1-NEXT: sarq %rdx
+; AVX1-NEXT: addq %rax, %rdx
+; AVX1-NEXT: leaq (,%rdx,8), %rax
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: subq %rax, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: imulq %rsi
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shrq $63, %rax
+; AVX1-NEXT: sarq %rdx
+; AVX1-NEXT: addq %rax, %rdx
+; AVX1-NEXT: leaq (,%rdx,8), %rax
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: subq %rax, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: imulq %rsi
+; AVX1-NEXT: movq %rdx, %rax
+; AVX1-NEXT: shrq $63, %rax
+; AVX1-NEXT: sarq %rdx
+; AVX1-NEXT: addq %rax, %rdx
+; AVX1-NEXT: leaq (,%rdx,8), %rax
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: subq %rax, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: imulq %rsi
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shrq $63, %rax
+; AVX2-NEXT: sarq %rdx
+; AVX2-NEXT: addq %rax, %rdx
+; AVX2-NEXT: leaq (,%rdx,8), %rax
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: subq %rax, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: vmovq %xmm1, %rcx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: imulq %rsi
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shrq $63, %rax
+; AVX2-NEXT: sarq %rdx
+; AVX2-NEXT: addq %rax, %rdx
+; AVX2-NEXT: leaq (,%rdx,8), %rax
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: subq %rax, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: imulq %rsi
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shrq $63, %rax
+; AVX2-NEXT: sarq %rdx
+; AVX2-NEXT: addq %rax, %rdx
+; AVX2-NEXT: leaq (,%rdx,8), %rax
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: subq %rax, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: imulq %rsi
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shrq $63, %rax
+; AVX2-NEXT: sarq %rdx
+; AVX2-NEXT: addq %rax, %rdx
+; AVX2-NEXT: leaq (,%rdx,8), %rax
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: subq %rax, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = srem <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
+ ret <4 x i64> %res
+}
+
+define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
+; AVX1-LABEL: test_rem7_8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpmuldq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $31, %xmm2, %xmm3
+; AVX1-NEXT: vpsrad $2, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
+; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld $31, %xmm1, %xmm4
+; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
+; AVX2-NEXT: vpsrld $31, %ymm1, %ymm2
+; AVX2-NEXT: vpsrad $2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <8 x i32> %res
+}
+
+define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
+; AVX1-LABEL: test_rem7_16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
+; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpsrlw $15, %xmm3, %xmm4
+; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3
+; AVX1-NEXT: vpaddw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7]
+; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpmulhw %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlw $15, %xmm2, %xmm3
+; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2
+; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpsrlw $15, %ymm1, %ymm2
+; AVX2-NEXT: vpsraw $1, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = srem <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <16 x i16> %res
+}
+
+define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
+; AVX1-LABEL: test_rem7_32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpmovsxbw %xmm2, %xmm3
+; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm1
+; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
+; AVX1-NEXT: vpmullw %xmm1, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm8, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vpand %xmm9, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT: vpxor %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpsubb %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxbw %xmm3, %xmm4
+; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm5
+; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3
+; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3
+; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm4, %xmm4
+; AVX1-NEXT: vpmullw %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm9, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxbw %xmm1, %xmm3
+; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpmovsxbw %xmm3, %ymm3
+; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmovsxbw %xmm0, %ymm3
+; AVX2-NEXT: vpmullw %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsubb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsrlw $7, %ymm1, %ymm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm3
+; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = srem <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ ret <32 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/test/CodeGen/X86/vector-idiv-sdiv-512.ll
new file mode 100644
index 000000000000..1bb7181d31df
--- /dev/null
+++ b/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -0,0 +1,2392 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
+
+;
+; sdiv by 7
+;
+
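+; The checks in this file encode the usual signed magic-number expansion of a
+; divide by 7. For 64-bit lanes the magic is ceil(2^65 / 7) = 5270498306774157605
+; (0x4924924924924925): take the high 64 bits of the multiply, shift right one
+; more bit, and add the sign bit so the quotient rounds toward zero. Narrower
+; lanes use the analogous magics (-1840700269 for i32, 18725 for i16, -109 for
+; i8), adding the dividend back wherever the magic is negative. The hand-written
+; scalar helper below is only an illustrative sketch of the 64-bit expansion;
+; nothing in the checks refers to it.
+define i64 @sdiv7_sketch(i64 %x) nounwind {
+  %wide = sext i64 %x to i128
+  %mul = mul i128 %wide, 5270498306774157605   ; x * ceil(2^65 / 7)
+  %shifted = lshr i128 %mul, 64
+  %hi = trunc i128 %shifted to i64             ; high 64 bits of the product
+  %sign = lshr i64 %hi, 63                     ; 1 only when the quotient is negative
+  %sar = ashr i64 %hi, 1                       ; total shift of 65 bits
+  %q = add i64 %sar, %sign                     ; round toward zero
+  ret i64 %q
+}
+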
+define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind {
+; AVX-LABEL: test_div7_8i64:
+; AVX: # BB#0:
+; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX-NEXT: vpextrq $1, %xmm1, %rax
+; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm2
+; AVX-NEXT: vmovq %xmm1, %rax
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rax
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rax
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm2
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rax
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rax
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm2
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm3
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: imulq %rcx
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: retq
+ %res = sdiv <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
+ ret <8 x i64> %res
+}
+
+define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
+; AVX-LABEL: test_div7_16i32:
+; AVX: # BB#0:
+; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX-NEXT: vpextrd $1, %xmm1, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vmovd %xmm1, %ecx
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm2
+; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; AVX-NEXT: vpextrd $2, %xmm1, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
+; AVX-NEXT: vpextrd $3, %xmm1, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
+; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vmovd %xmm2, %ecx
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
+; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vmovd %xmm2, %ecx
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm0, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vmovd %xmm0, %ecx
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm0, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm0, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: shrl $31, %ecx
+; AVX-NEXT: sarl $2, %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: retq
+ %res = sdiv <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <16 x i32> %res
+}
+
+define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
+; AVX512F-LABEL: test_div7_32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
+; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm3
+; AVX512F-NEXT: vpsraw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $15, %ymm1, %ymm2
+; AVX512F-NEXT: vpsraw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_div7_32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $15, %zmm0, %zmm1
+; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+ %res = sdiv <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <32 x i16> %res
+}
+
+define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
+; AVX512F-LABEL: test_div7_64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm3
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512F-NEXT: vpmovsxbw %xmm4, %ymm4
+; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm5
+; AVX512F-NEXT: vpmullw %ymm2, %ymm5, %ymm5
+; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm5[2,3],ymm4[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpackuswb %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT: vpxor %ymm7, %ymm0, %ymm0
+; AVX512F-NEXT: vpsubb %ymm7, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX512F-NEXT: vpmovsxbw %xmm4, %ymm4
+; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm4
+; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2
+; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpxor %ymm7, %ymm1, %ymm1
+; AVX512F-NEXT: vpsubb %ymm7, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_div7_64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
+; AVX512BW-NEXT: movsbl %cl, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %edx
+; AVX512BW-NEXT: shrl $8, %edx
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: movl %ecx, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %cl
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: movzbl %cl, %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm2
+; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $3, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $5, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $6, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $9, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $14, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $15, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx
+; AVX512BW-NEXT: movsbl %cl, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %edx
+; AVX512BW-NEXT: shrl $8, %edx
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: movl %ecx, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %cl
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: movzbl %cl, %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx
+; AVX512BW-NEXT: movsbl %cl, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %edx
+; AVX512BW-NEXT: shrl $8, %edx
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: movl %ecx, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %cl
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: movzbl %cl, %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %ecx
+; AVX512BW-NEXT: movsbl %cl, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %edx
+; AVX512BW-NEXT: shrl $8, %edx
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: movl %ecx, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %cl
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: movzbl %cl, %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %eax
+; AVX512BW-NEXT: imull $-109, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+ %res = sdiv <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ ret <64 x i8> %res
+}
+
+;
+; srem by 7
+;
+
+define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
+; AVX-LABEL: test_rem7_8i64:
+; AVX: # BB#0:
+; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vmovq %xmm1, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: vmovq %xmm0, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: imulq %rsi
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $63, %rax
+; AVX-NEXT: sarq %rdx
+; AVX-NEXT: addq %rax, %rdx
+; AVX-NEXT: leaq (,%rdx,8), %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: subq %rax, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: retq
+ %res = srem <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
+ ret <8 x i64> %res
+}
+
+define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
+; AVX-LABEL: test_rem7_16i32:
+; AVX: # BB#0:
+; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX-NEXT: vpextrd $1, %xmm1, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vmovd %xmm1, %ecx
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: movl %edx, %esi
+; AVX-NEXT: shrl $31, %esi
+; AVX-NEXT: sarl $2, %edx
+; AVX-NEXT: addl %esi, %edx
+; AVX-NEXT: leal (,%rdx,8), %esi
+; AVX-NEXT: subl %edx, %esi
+; AVX-NEXT: subl %esi, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm2
+; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; AVX-NEXT: vpextrd $2, %xmm1, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
+; AVX-NEXT: vpextrd $3, %xmm1, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
+; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vmovd %xmm2, %ecx
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: movl %edx, %esi
+; AVX-NEXT: shrl $31, %esi
+; AVX-NEXT: sarl $2, %edx
+; AVX-NEXT: addl %esi, %edx
+; AVX-NEXT: leal (,%rdx,8), %esi
+; AVX-NEXT: subl %edx, %esi
+; AVX-NEXT: subl %esi, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
+; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vmovd %xmm2, %ecx
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: movl %edx, %esi
+; AVX-NEXT: shrl $31, %esi
+; AVX-NEXT: sarl $2, %edx
+; AVX-NEXT: addl %esi, %edx
+; AVX-NEXT: leal (,%rdx,8), %esi
+; AVX-NEXT: subl %edx, %esi
+; AVX-NEXT: subl %esi, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm2, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm0, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vmovd %xmm0, %ecx
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: movl %edx, %esi
+; AVX-NEXT: shrl $31, %esi
+; AVX-NEXT: sarl $2, %edx
+; AVX-NEXT: addl %esi, %edx
+; AVX-NEXT: leal (,%rdx,8), %esi
+; AVX-NEXT: subl %edx, %esi
+; AVX-NEXT: subl %esi, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm0, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm0, %eax
+; AVX-NEXT: cltq
+; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: addl %eax, %ecx
+; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shrl $31, %edx
+; AVX-NEXT: sarl $2, %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: leal (,%rcx,8), %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: subl %edx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: retq
+ %res = srem <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <16 x i32> %res
+}
+
+define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
+; AVX512F-LABEL: test_rem7_32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
+; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vpsrlw $15, %ymm3, %ymm4
+; AVX512F-NEXT: vpsraw $1, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT: vpsrlw $15, %ymm2, %ymm3
+; AVX512F-NEXT: vpsraw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_rem7_32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512BW-NEXT: vpsrlw $15, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsraw $1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+ %res = srem <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <32 x i16> %res
+}
+
+define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
+; AVX512F-LABEL: test_rem7_64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm2
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512F-NEXT: vpmovsxbw %xmm4, %ymm4
+; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm3
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm5
+; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm5
+; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm5[2,3],ymm4[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpackuswb %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm0, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $7, %ymm4, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpand %ymm10, %ymm6, %ymm8
+; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT: vpxor %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT: vpsubb %ymm7, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm8, %ymm4, %ymm8
+; AVX512F-NEXT: vpmovsxbw %xmm8, %ymm9
+; AVX512F-NEXT: vpmovsxbw {{.*}}(%rip), %ymm4
+; AVX512F-NEXT: vpmullw %ymm4, %ymm9, %ymm9
+; AVX512F-NEXT: vpmovsxwd %ymm9, %zmm9
+; AVX512F-NEXT: vpmovdb %zmm9, %xmm9
+; AVX512F-NEXT: vextracti128 $1, %ymm8, %xmm5
+; AVX512F-NEXT: vpmovsxbw %xmm5, %ymm5
+; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm5
+; AVX512F-NEXT: vpmovsxwd %ymm5, %zmm5
+; AVX512F-NEXT: vpmovdb %zmm5, %xmm5
+; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm9, %ymm5
+; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX512F-NEXT: vpmovsxbw %xmm5, %ymm5
+; AVX512F-NEXT: vpmullw %ymm2, %ymm5, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm5
+; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm3[2,3],ymm2[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm3
+; AVX512F-NEXT: vpand %ymm10, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpxor %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT: vpsubb %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm3
+; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpmovsxwd %ymm3, %zmm3
+; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_rem7_64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %edx
+; AVX512BW-NEXT: imull $-109, %edx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movb $7, %dil
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %edx
+; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %esi
+; AVX512BW-NEXT: imull $-109, %esi, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %sil
+; AVX512BW-NEXT: movzbl %sil, %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm2
+; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $3, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $5, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $6, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $9, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $14, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $15, %xmm1, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %esi
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %edx
+; AVX512BW-NEXT: imull $-109, %edx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %esi, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %esi
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %edx
+; AVX512BW-NEXT: imull $-109, %edx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %esi, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %esi
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %edx
+; AVX512BW-NEXT: imull $-109, %edx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrb $7, %cl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %esi, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512BW-NEXT: movsbl %al, %ecx
+; AVX512BW-NEXT: imull $-109, %ecx, %eax
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: movl %eax, %edx
+; AVX512BW-NEXT: shrb $7, %dl
+; AVX512BW-NEXT: sarb $2, %al
+; AVX512BW-NEXT: addb %dl, %al
+; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: mulb %dil
+; AVX512BW-NEXT: subb %al, %cl
+; AVX512BW-NEXT: movzbl %cl, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+ %res = srem <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <64 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-idiv-udiv-128.ll b/test/CodeGen/X86/vector-idiv-udiv-128.ll
new file mode 100644
index 000000000000..1e68dc9170bf
--- /dev/null
+++ b/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -0,0 +1,592 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+;
+; udiv by 7
+;
+
+define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
+; SSE2-LABEL: test_div7_2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: mulq %rsi
+; SSE2-NEXT: subq %rdx, %rcx
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: addq %rdx, %rcx
+; SSE2-NEXT: shrq $2, %rcx
+; SSE2-NEXT: movd %rcx, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: mulq %rsi
+; SSE2-NEXT: subq %rdx, %rcx
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: addq %rdx, %rcx
+; SSE2-NEXT: shrq $2, %rcx
+; SSE2-NEXT: movd %rcx, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_div7_2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rcx
+; SSE41-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: mulq %rsi
+; SSE41-NEXT: subq %rdx, %rcx
+; SSE41-NEXT: shrq %rcx
+; SSE41-NEXT: addq %rdx, %rcx
+; SSE41-NEXT: shrq $2, %rcx
+; SSE41-NEXT: movd %rcx, %xmm1
+; SSE41-NEXT: movd %xmm0, %rcx
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: mulq %rsi
+; SSE41-NEXT: subq %rdx, %rcx
+; SSE41-NEXT: shrq %rcx
+; SSE41-NEXT: addq %rdx, %rcx
+; SSE41-NEXT: shrq $2, %rcx
+; SSE41-NEXT: movd %rcx, %xmm0
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_div7_2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: vmovq %xmm0, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %res = udiv <2 x i64> %a, <i64 7, i64 7>
+ ret <2 x i64> %res
+}
+
+define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
+; SSE2-LABEL: test_div7_4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: psrld $2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_div7_4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: pmuludq %xmm2, %xmm3
+; SSE41-NEXT: pmuludq %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: psrld $1, %xmm0
+; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: psrld $2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_div7_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $2, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %res = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+ ret <4 x i32> %res
+}
+
+define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
+; SSE-LABEL: test_div7_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; SSE-NEXT: pmulhuw %xmm0, %xmm1
+; SSE-NEXT: psubw %xmm1, %xmm0
+; SSE-NEXT: psrlw $1, %xmm0
+; SSE-NEXT: paddw %xmm1, %xmm0
+; SSE-NEXT: psrlw $2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_div7_8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %res = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <8 x i16> %res
+}
+
+define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
+; SSE2-LABEL: test_div7_16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: pmullw %xmm1, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psrlw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm1, %xmm3
+; SSE2-NEXT: psrlw $8, %xmm3
+; SSE2-NEXT: packuswb %xmm2, %xmm3
+; SSE2-NEXT: psubb %xmm3, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: paddb %xmm3, %xmm0
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_div7_16i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; SSE41-NEXT: pmullw %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE41-NEXT: pmullw %xmm2, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: psubb %xmm1, %xmm0
+; SSE41-NEXT: psrlw $1, %xmm0
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT: paddb %xmm1, %xmm0
+; SSE41-NEXT: psrlw $2, %xmm0
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_div7_16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %res = udiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <16 x i8> %res
+}
+
+;
+; urem by 7
+;
+
+define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
+; SSE2-LABEL: test_rem7_2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: mulq %rsi
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: subq %rdx, %rax
+; SSE2-NEXT: shrq %rax
+; SSE2-NEXT: addq %rdx, %rax
+; SSE2-NEXT: shrq $2, %rax
+; SSE2-NEXT: leaq (,%rax,8), %rdx
+; SSE2-NEXT: subq %rax, %rdx
+; SSE2-NEXT: subq %rdx, %rcx
+; SSE2-NEXT: movd %rcx, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movd %xmm0, %rcx
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: mulq %rsi
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: subq %rdx, %rax
+; SSE2-NEXT: shrq %rax
+; SSE2-NEXT: addq %rdx, %rax
+; SSE2-NEXT: shrq $2, %rax
+; SSE2-NEXT: leaq (,%rax,8), %rdx
+; SSE2-NEXT: subq %rax, %rdx
+; SSE2-NEXT: subq %rdx, %rcx
+; SSE2-NEXT: movd %rcx, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_rem7_2i64:
+; SSE41: # BB#0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rcx
+; SSE41-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: mulq %rsi
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: subq %rdx, %rax
+; SSE41-NEXT: shrq %rax
+; SSE41-NEXT: addq %rdx, %rax
+; SSE41-NEXT: shrq $2, %rax
+; SSE41-NEXT: leaq (,%rax,8), %rdx
+; SSE41-NEXT: subq %rax, %rdx
+; SSE41-NEXT: subq %rdx, %rcx
+; SSE41-NEXT: movd %rcx, %xmm1
+; SSE41-NEXT: movd %xmm0, %rcx
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: mulq %rsi
+; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: subq %rdx, %rax
+; SSE41-NEXT: shrq %rax
+; SSE41-NEXT: addq %rdx, %rax
+; SSE41-NEXT: shrq $2, %rax
+; SSE41-NEXT: leaq (,%rax,8), %rdx
+; SSE41-NEXT: subq %rax, %rdx
+; SSE41-NEXT: subq %rdx, %rcx
+; SSE41-NEXT: movd %rcx, %xmm0
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_rem7_2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: vmovq %xmm0, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %res = urem <2 x i64> %a, <i64 7, i64 7>
+ ret <2 x i64> %res
+}
+
+define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
+; SSE2-LABEL: test_rem7_4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psubd %xmm2, %xmm1
+; SSE2-NEXT: psrld $1, %xmm1
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: psrld $2, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [7,7,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_rem7_4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: pmuludq %xmm2, %xmm3
+; SSE41-NEXT: pmuludq %xmm0, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psubd %xmm1, %xmm2
+; SSE41-NEXT: psrld $1, %xmm2
+; SSE41-NEXT: paddd %xmm1, %xmm2
+; SSE41-NEXT: psrld $2, %xmm2
+; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
+; SSE41-NEXT: psubd %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_rem7_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpsrld $1, %xmm2, %xmm2
+; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %res = urem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+ ret <4 x i32> %res
+}
+
+define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
+; SSE-LABEL: test_rem7_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; SSE-NEXT: pmulhuw %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: psubw %xmm1, %xmm2
+; SSE-NEXT: psrlw $1, %xmm2
+; SSE-NEXT: paddw %xmm1, %xmm2
+; SSE-NEXT: psrlw $2, %xmm2
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm2
+; SSE-NEXT: psubw %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_rem7_8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vpsrlw $1, %xmm2, %xmm2
+; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpsrlw $2, %xmm1, %xmm1
+; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %res = urem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <8 x i16> %res
+}
+
+define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
+; SSE2-LABEL: test_rem7_16i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: pmullw %xmm1, %xmm2
+; SSE2-NEXT: psrlw $8, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psrlw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm1, %xmm3
+; SSE2-NEXT: psrlw $8, %xmm3
+; SSE2-NEXT: packuswb %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psubb %xmm3, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: paddb %xmm3, %xmm1
+; SSE2-NEXT: psrlw $2, %xmm1
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psraw $8, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; SSE2-NEXT: psraw $8, %xmm3
+; SSE2-NEXT: pmullw %xmm3, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: pmullw %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: psubb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_rem7_16i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; SSE41-NEXT: pmullw %xmm2, %xmm1
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE41-NEXT: pmullw %xmm2, %xmm3
+; SSE41-NEXT: psrlw $8, %xmm3
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psubb %xmm1, %xmm2
+; SSE41-NEXT: psrlw $1, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: paddb %xmm1, %xmm2
+; SSE41-NEXT: psrlw $2, %xmm2
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm1
+; SSE41-NEXT: pmovsxbw {{.*}}(%rip), %xmm3
+; SSE41-NEXT: pmullw %xmm3, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
+; SSE41-NEXT: pmullw %xmm3, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: packuswb %xmm2, %xmm1
+; SSE41-NEXT: psubb %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_rem7_16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxbw %xmm1, %xmm2
+; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm3
+; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpsrlw $1, %xmm2, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %res = urem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <16 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-idiv-udiv-256.ll b/test/CodeGen/X86/vector-idiv-udiv-256.ll
new file mode 100644
index 000000000000..a1d356a0e762
--- /dev/null
+++ b/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -0,0 +1,551 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+;
+; udiv by 7
+;
+
+define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
+; AVX1-LABEL: test_div7_4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX1-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: mulq %rsi
+; AVX1-NEXT: subq %rdx, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: addq %rdx, %rcx
+; AVX1-NEXT: shrq $2, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: vmovq %xmm1, %rcx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: mulq %rsi
+; AVX1-NEXT: subq %rdx, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: addq %rdx, %rcx
+; AVX1-NEXT: shrq $2, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: mulq %rsi
+; AVX1-NEXT: subq %rdx, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: addq %rdx, %rcx
+; AVX1-NEXT: shrq $2, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: mulq %rsi
+; AVX1-NEXT: subq %rdx, %rcx
+; AVX1-NEXT: shrq %rcx
+; AVX1-NEXT: addq %rdx, %rcx
+; AVX1-NEXT: shrq $2, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: mulq %rsi
+; AVX2-NEXT: subq %rdx, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: shrq $2, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: vmovq %xmm1, %rcx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: mulq %rsi
+; AVX2-NEXT: subq %rdx, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: shrq $2, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: mulq %rsi
+; AVX2-NEXT: subq %rdx, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: shrq $2, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: mulq %rsi
+; AVX2-NEXT: subq %rdx, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: addq %rdx, %rcx
+; AVX2-NEXT: shrq $2, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = udiv <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
+ ret <4 x i64> %res
+}
+
+define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
+; AVX1-LABEL: test_div7_8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = udiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <8 x i32> %res
+}
+
+define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
+; AVX1-LABEL: test_div7_16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = udiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <16 x i16> %res
+}
+
+define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
+; AVX1-LABEL: test_div7_32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; AVX1-NEXT: vpmullw %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_div7_32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = udiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <32 x i8> %res
+}
+
+;
+; urem by 7
+;
+
+define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
+; AVX1-LABEL: test_rem7_4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX1-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: mulq %rsi
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: addq %rdx, %rax
+; AVX1-NEXT: shrq $2, %rax
+; AVX1-NEXT: leaq (,%rax,8), %rdx
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: subq %rdx, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: vmovq %xmm1, %rcx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: mulq %rsi
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: addq %rdx, %rax
+; AVX1-NEXT: shrq $2, %rax
+; AVX1-NEXT: leaq (,%rax,8), %rdx
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: subq %rdx, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: mulq %rsi
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: addq %rdx, %rax
+; AVX1-NEXT: shrq $2, %rax
+; AVX1-NEXT: leaq (,%rax,8), %rdx
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: subq %rdx, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: mulq %rsi
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: shrq %rax
+; AVX1-NEXT: addq %rdx, %rax
+; AVX1-NEXT: shrq $2, %rax
+; AVX1-NEXT: leaq (,%rax,8), %rdx
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: subq %rdx, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: mulq %rsi
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: addq %rdx, %rax
+; AVX2-NEXT: shrq $2, %rax
+; AVX2-NEXT: leaq (,%rax,8), %rdx
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: subq %rdx, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: vmovq %xmm1, %rcx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: mulq %rsi
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: addq %rdx, %rax
+; AVX2-NEXT: shrq $2, %rax
+; AVX2-NEXT: leaq (,%rax,8), %rdx
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: subq %rdx, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: mulq %rsi
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: addq %rdx, %rax
+; AVX2-NEXT: shrq $2, %rax
+; AVX2-NEXT: leaq (,%rax,8), %rdx
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: subq %rdx, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: mulq %rsi
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: shrq %rax
+; AVX2-NEXT: addq %rdx, %rax
+; AVX2-NEXT: shrq $2, %rax
+; AVX2-NEXT: leaq (,%rax,8), %rdx
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: subq %rdx, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = urem <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
+ ret <4 x i64> %res
+}
+
+define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
+; AVX1-LABEL: test_rem7_8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm3
+; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrld $2, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7]
+; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
+; AVX1-NEXT: vpaddd %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
+; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
+; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpsrld $2, %ymm1, %ymm1
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = urem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <8 x i32> %res
+}
+
+define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
+; AVX1-LABEL: test_rem7_16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4
+; AVX1-NEXT: vpaddw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7]
+; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsubw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
+; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
+; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = urem <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <16 x i16> %res
+}
+
+define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
+; AVX1-LABEL: test_rem7_32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-NEXT: vpmullw %xmm1, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX1-NEXT: vpand %xmm8, %xmm4, %xmm4
+; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxbw %xmm3, %xmm6
+; AVX1-NEXT: vpmovsxbw {{.*}}(%rip), %xmm7
+; AVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm3, %xmm3
+; AVX1-NEXT: vpmullw %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
+; AVX1-NEXT: vpmullw %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxbw %xmm1, %xmm3
+; AVX1-NEXT: vpmullw %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxbw %xmm1, %xmm1
+; AVX1-NEXT: vpmullw %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_rem7_32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vpmullw %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT: vpmovsxbw {{.*}}(%rip), %ymm3
+; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %res = urem <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <32 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-idiv-udiv-512.ll b/test/CodeGen/X86/vector-idiv-udiv-512.ll
new file mode 100644
index 000000000000..35c902c5cc21
--- /dev/null
+++ b/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -0,0 +1,2100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
+
+;
+; udiv by 7
+;
+
+define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind {
+; AVX-LABEL: test_div7_8i64:
+; AVX: # BB#0:
+; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vmovq %xmm1, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: vmovq %xmm0, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: shrq %rcx
+; AVX-NEXT: addq %rdx, %rcx
+; AVX-NEXT: shrq $2, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: retq
+ %res = udiv <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
+ ret <8 x i64> %res
+}
+
+define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
+; AVX-LABEL: test_div7_16i32:
+; AVX: # BB#0:
+; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX-NEXT: vpextrd $1, %xmm1, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vmovd %xmm1, %ecx
+; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: shrl %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: shrl $2, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm2
+; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; AVX-NEXT: vpextrd $2, %xmm1, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
+; AVX-NEXT: vpextrd $3, %xmm1, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
+; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vmovd %xmm2, %ecx
+; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: shrl %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: shrl $2, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
+; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vmovd %xmm2, %ecx
+; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: shrl %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: shrl $2, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm0, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vmovd %xmm0, %ecx
+; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: shrl %ecx
+; AVX-NEXT: addl %edx, %ecx
+; AVX-NEXT: shrl $2, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm0, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm0, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: shrl %eax
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: shrl $2, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: retq
+ %res = udiv <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <16 x i32> %res
+}
+
+define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
+; AVX512F-LABEL: test_div7_32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_div7_32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+ %res = udiv <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <32 x i16> %res
+}
+
+define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
+; AVX512F-LABEL: test_div7_64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512F-NEXT: vpmullw %ymm2, %ymm5, %ymm5
+; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm5[2,3],ymm4[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpackuswb %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpsubb %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm6
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero
+; AVX512F-NEXT: vpmullw %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512F-NEXT: vpmullw %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm3[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_div7_64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
+; AVX512BW-NEXT: imull $37, %ecx, %edx
+; AVX512BW-NEXT: shrl $8, %edx
+; AVX512BW-NEXT: subb %dl, %cl
+; AVX512BW-NEXT: shrb %cl
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: shrb $2, %cl
+; AVX512BW-NEXT: movzbl %cl, %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm2
+; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $3, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $5, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $6, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $9, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $14, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $15, %xmm1, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx
+; AVX512BW-NEXT: imull $37, %ecx, %edx
+; AVX512BW-NEXT: shrl $8, %edx
+; AVX512BW-NEXT: subb %dl, %cl
+; AVX512BW-NEXT: shrb %cl
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: shrb $2, %cl
+; AVX512BW-NEXT: movzbl %cl, %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx
+; AVX512BW-NEXT: imull $37, %ecx, %edx
+; AVX512BW-NEXT: shrl $8, %edx
+; AVX512BW-NEXT: subb %dl, %cl
+; AVX512BW-NEXT: shrb %cl
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: shrb $2, %cl
+; AVX512BW-NEXT: movzbl %cl, %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %ecx
+; AVX512BW-NEXT: imull $37, %ecx, %edx
+; AVX512BW-NEXT: shrl $8, %edx
+; AVX512BW-NEXT: subb %dl, %cl
+; AVX512BW-NEXT: shrb %cl
+; AVX512BW-NEXT: addb %dl, %cl
+; AVX512BW-NEXT: shrb $2, %cl
+; AVX512BW-NEXT: movzbl %cl, %ecx
+; AVX512BW-NEXT: vmovd %ecx, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
+; AVX512BW-NEXT: imull $37, %eax, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movzbl %al, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+ %res = udiv <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ ret <64 x i8> %res
+}
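+
+; Note on the AVX512BW checks above: x86 has no byte-wise vector multiply-high,
+; so the v64i8 udiv is expected to be scalarized with vpextrb/vpinsrb. Each byte
+; x goes through the magic-number add-back sequence (illustrative C-style sketch
+; of the arithmetic the checks encode, not part of the test):
+;   hi = (x * 37) >> 8;                 // unsigned multiply-high by the magic 37
+;   q  = (((x - hi) >> 1) + hi) >> 2;   // == x / 7 for any 0 <= x <= 255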
+
+;
+; urem by 7
+;
+
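+; The remainder tests below reuse the udiv-by-7 lowering and then fold in
+; r = x - 7*q. A minimal C reference for the expected per-element arithmetic
+; (hedged sketch; helper names are illustrative and assume <stdint.h>, they are
+; not part of the test):
+;   static inline uint32_t udiv7_u32(uint32_t x) {
+;     uint32_t hi = (uint32_t)(((uint64_t)x * 613566757u) >> 32); // 0x24924925
+;     return (((x - hi) >> 1) + hi) >> 2;                         // x / 7
+;   }
+;   static inline uint32_t urem7_u32(uint32_t x) {
+;     uint32_t q = udiv7_u32(x);
+;     return x - 7u * q;   // the asm builds 7*q as (q << 3) - q via lea/sub
+;   }
+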
+define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
+; AVX-LABEL: test_rem7_8i64:
+; AVX: # BB#0:
+; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vmovq %xmm1, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: vmovq %xmm2, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: vmovq %xmm0, %rcx
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: mulq %rsi
+; AVX-NEXT: movq %rcx, %rax
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: shrq %rax
+; AVX-NEXT: addq %rdx, %rax
+; AVX-NEXT: shrq $2, %rax
+; AVX-NEXT: leaq (,%rax,8), %rdx
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: subq %rdx, %rcx
+; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: retq
+ %res = urem <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
+ ret <8 x i64> %res
+}
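+
+; The i64 checks above expect full scalarization: mulq by the magic constant
+; 0x2492492492492493 (7 * 0x2492492492492493 == 2^64 + 5) leaves the high half
+; of the product in %rdx, and each element then follows the same sketch as
+; above (C-style, illustrative only):
+;   q = (((x - hi) >> 1) + hi) >> 2;   // x / 7
+;   r = x - ((q << 3) - q);            // leaq (,%rax,8) + subq builds 7*q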
+
+define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
+; AVX-LABEL: test_rem7_16i32:
+; AVX: # BB#0:
+; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX-NEXT: vpextrd $1, %xmm1, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vmovd %xmm1, %ecx
+; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: movl %ecx, %esi
+; AVX-NEXT: subl %edx, %esi
+; AVX-NEXT: shrl %esi
+; AVX-NEXT: addl %edx, %esi
+; AVX-NEXT: shrl $2, %esi
+; AVX-NEXT: leal (,%rsi,8), %edx
+; AVX-NEXT: subl %esi, %edx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm2
+; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; AVX-NEXT: vpextrd $2, %xmm1, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
+; AVX-NEXT: vpextrd $3, %xmm1, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
+; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vmovd %xmm2, %ecx
+; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: movl %ecx, %esi
+; AVX-NEXT: subl %edx, %esi
+; AVX-NEXT: shrl %esi
+; AVX-NEXT: addl %edx, %esi
+; AVX-NEXT: shrl $2, %esi
+; AVX-NEXT: leal (,%rsi,8), %edx
+; AVX-NEXT: subl %esi, %edx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
+; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vmovd %xmm2, %ecx
+; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: movl %ecx, %esi
+; AVX-NEXT: subl %edx, %esi
+; AVX-NEXT: shrl %esi
+; AVX-NEXT: addl %edx, %esi
+; AVX-NEXT: shrl $2, %esi
+; AVX-NEXT: leal (,%rsi,8), %edx
+; AVX-NEXT: subl %esi, %edx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm2, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
+; AVX-NEXT: vpextrd $1, %xmm0, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vmovd %xmm0, %ecx
+; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rdx
+; AVX-NEXT: movl %ecx, %esi
+; AVX-NEXT: subl %edx, %esi
+; AVX-NEXT: shrl %esi
+; AVX-NEXT: addl %edx, %esi
+; AVX-NEXT: shrl $2, %esi
+; AVX-NEXT: leal (,%rsi,8), %edx
+; AVX-NEXT: subl %esi, %edx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: vmovd %ecx, %xmm3
+; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $2, %xmm0, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX-NEXT: vpextrd $3, %xmm0, %eax
+; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
+; AVX-NEXT: shrq $32, %rcx
+; AVX-NEXT: movl %eax, %edx
+; AVX-NEXT: subl %ecx, %edx
+; AVX-NEXT: shrl %edx
+; AVX-NEXT: addl %ecx, %edx
+; AVX-NEXT: shrl $2, %edx
+; AVX-NEXT: leal (,%rdx,8), %ecx
+; AVX-NEXT: subl %edx, %ecx
+; AVX-NEXT: subl %ecx, %eax
+; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX-NEXT: retq
+ %res = urem <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <16 x i32> %res
+}
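+
+; Likewise for i32: each lane is extracted and run through a 32x32->64 multiply
+; (imulq $613566757 followed by shrq $32 acts as the unsigned multiply-high;
+; 7 * 613566757 == 2^32 + 3), then the same shift/add-back and x - 7*q fixup as
+; in the sketch after the "urem by 7" header above.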
+
+define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
+; AVX512F-LABEL: test_rem7_32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm4
+; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddw %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
+; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_rem7_32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+ %res = urem <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <32 x i16> %res
+}
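+
+; The v32i16 case stays vectorized: vpmulhuw provides the unsigned multiply-high
+; directly (magic 9363; 7 * 9363 == 2^16 + 5). AVX512F handles the two ymm
+; halves separately, AVX512BW operates on the full zmm. Per lane, in C-style
+; terms (illustrative only):
+;   hi = (uint16_t)(((uint32_t)x * 9363) >> 16);
+;   q  = (((x - hi) >> 1) + hi) >> 2;
+;   r  = x - 7 * q;                     // vpmullw by 7 then vpsubw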
+
+define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
+; AVX512F-LABEL: test_rem7_64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm2
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm5
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm5[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpackuswb %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm5
+; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6
+; AVX512F-NEXT: vpaddb %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm7
+; AVX512F-NEXT: vpmovsxbw %xmm7, %ymm8
+; AVX512F-NEXT: vpmovsxbw {{.*}}(%rip), %ymm3
+; AVX512F-NEXT: vpmullw %ymm3, %ymm8, %ymm8
+; AVX512F-NEXT: vpmovsxwd %ymm8, %zmm8
+; AVX512F-NEXT: vpmovdb %zmm8, %xmm8
+; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512F-NEXT: vpmovsxbw %xmm7, %ymm7
+; AVX512F-NEXT: vpmullw %ymm3, %ymm7, %ymm7
+; AVX512F-NEXT: vpmovsxwd %ymm7, %zmm7
+; AVX512F-NEXT: vpmovdb %zmm7, %xmm7
+; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
+; AVX512F-NEXT: vpsubb %ymm7, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm7
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero,xmm7[8],zero,xmm7[9],zero,xmm7[10],zero,xmm7[11],zero,xmm7[12],zero,xmm7[13],zero,xmm7[14],zero,xmm7[15],zero
+; AVX512F-NEXT: vpmullw %ymm2, %ymm7, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512F-NEXT: vpmullw %ymm4, %ymm7, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm4[2,3],ymm2[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm4
+; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4
+; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm4
+; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
+; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4
+; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_rem7_64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-NEXT: vpextrb $1, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %ecx
+; AVX512BW-NEXT: shrl $8, %ecx
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %cl, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %cl, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: movb $7, %cl
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %edx
+; AVX512BW-NEXT: vpextrb $0, %xmm1, %esi
+; AVX512BW-NEXT: imull $37, %esi, %edi
+; AVX512BW-NEXT: shrl $8, %edi
+; AVX512BW-NEXT: movl %esi, %eax
+; AVX512BW-NEXT: subb %dil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %dil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %sil
+; AVX512BW-NEXT: movzbl %sil, %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm2
+; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $2, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $3, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $4, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $5, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $6, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $7, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $8, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $9, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $10, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $11, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $12, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $13, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $14, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpextrb $15, %xmm1, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %edx
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %esi
+; AVX512BW-NEXT: imull $37, %esi, %edi
+; AVX512BW-NEXT: shrl $8, %edi
+; AVX512BW-NEXT: movl %esi, %eax
+; AVX512BW-NEXT: subb %dil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %dil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %sil
+; AVX512BW-NEXT: movzbl %sil, %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %edx
+; AVX512BW-NEXT: vpextrb $0, %xmm2, %esi
+; AVX512BW-NEXT: imull $37, %esi, %edi
+; AVX512BW-NEXT: shrl $8, %edi
+; AVX512BW-NEXT: movl %esi, %eax
+; AVX512BW-NEXT: subb %dil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %dil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %sil
+; AVX512BW-NEXT: movzbl %sil, %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
+; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %edx
+; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi
+; AVX512BW-NEXT: imull $37, %esi, %edi
+; AVX512BW-NEXT: shrl $8, %edi
+; AVX512BW-NEXT: movl %esi, %eax
+; AVX512BW-NEXT: subb %dil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %dil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %sil
+; AVX512BW-NEXT: movzbl %sil, %eax
+; AVX512BW-NEXT: vmovd %eax, %xmm3
+; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $3, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $4, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $7, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $8, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $11, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $12, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrb $15, %xmm0, %edx
+; AVX512BW-NEXT: imull $37, %edx, %esi
+; AVX512BW-NEXT: shrl $8, %esi
+; AVX512BW-NEXT: movl %edx, %eax
+; AVX512BW-NEXT: subb %sil, %al
+; AVX512BW-NEXT: shrb %al
+; AVX512BW-NEXT: addb %sil, %al
+; AVX512BW-NEXT: shrb $2, %al
+; AVX512BW-NEXT: mulb %cl
+; AVX512BW-NEXT: subb %al, %dl
+; AVX512BW-NEXT: movzbl %dl, %eax
+; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+ %res = urem <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <64 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll
index 1117e206e5b0..6719a66f030f 100644
--- a/test/CodeGen/X86/vector-idiv.ll
+++ b/test/CodeGen/X86/vector-idiv.ll
@@ -1,1212 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=x86-64 -mcpu=core2 -mattr=+sse4.1 < %s | FileCheck %s --check-prefix=SSE41
-; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s --check-prefix=SSE
-; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
-target triple = "x86_64-unknown-unknown"
-
-define <4 x i32> @test1(<4 x i32> %a) #0 {
-; SSE41-LABEL: test1:
-; SSE41: # BB#0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq %xmm2, %xmm3
-; SSE41-NEXT: pmuludq %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; SSE41-NEXT: psubd %xmm1, %xmm0
-; SSE41-NEXT: psrld $1, %xmm0
-; SSE41-NEXT: paddd %xmm1, %xmm0
-; SSE41-NEXT: psrld $2, %xmm0
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test1:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: psubd %xmm2, %xmm0
-; SSE-NEXT: psrld $1, %xmm0
-; SSE-NEXT: paddd %xmm2, %xmm0
-; SSE-NEXT: psrld $2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test1:
-; AVX: # BB#0:
-; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $2, %xmm0, %xmm0
-; AVX-NEXT: retq
- %div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
- ret <4 x i32> %div
-}
-
-define <8 x i32> @test2(<8 x i32> %a) #0 {
-; SSE41-LABEL: test2:
-; SSE41: # BB#0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pmuludq %xmm2, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
-; SSE41-NEXT: psubd %xmm5, %xmm0
-; SSE41-NEXT: psrld $1, %xmm0
-; SSE41-NEXT: paddd %xmm5, %xmm0
-; SSE41-NEXT: psrld $2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE41-NEXT: pmuludq %xmm3, %xmm4
-; SSE41-NEXT: pmuludq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; SSE41-NEXT: psubd %xmm2, %xmm1
-; SSE41-NEXT: psrld $1, %xmm1
-; SSE41-NEXT: paddd %xmm2, %xmm1
-; SSE41-NEXT: psrld $2, %xmm1
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test2:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; SSE-NEXT: psubd %xmm3, %xmm0
-; SSE-NEXT: psrld $1, %xmm0
-; SSE-NEXT: paddd %xmm3, %xmm0
-; SSE-NEXT: psrld $2, %xmm0
-; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: psubd %xmm2, %xmm1
-; SSE-NEXT: psrld $1, %xmm1
-; SSE-NEXT: paddd %xmm2, %xmm1
-; SSE-NEXT: psrld $2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test2:
-; AVX: # BB#0:
-; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
-; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
-; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpsrld $1, %ymm0, %ymm0
-; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpsrld $2, %ymm0, %ymm0
-; AVX-NEXT: retq
- %div = udiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
- ret <8 x i32> %div
-}
-
-define <8 x i16> @test3(<8 x i16> %a) #0 {
-; SSE41-LABEL: test3:
-; SSE41: # BB#0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
-; SSE41-NEXT: pmulhuw %xmm0, %xmm1
-; SSE41-NEXT: psubw %xmm1, %xmm0
-; SSE41-NEXT: psrlw $1, %xmm0
-; SSE41-NEXT: paddw %xmm1, %xmm0
-; SSE41-NEXT: psrlw $2, %xmm0
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test3:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
-; SSE-NEXT: pmulhuw %xmm0, %xmm1
-; SSE-NEXT: psubw %xmm1, %xmm0
-; SSE-NEXT: psrlw $1, %xmm0
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: psrlw $2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test3:
-; AVX: # BB#0:
-; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
-; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX-NEXT: retq
- %div = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
- ret <8 x i16> %div
-}
-
-define <16 x i16> @test4(<16 x i16> %a) #0 {
-; SSE41-LABEL: test4:
-; SSE41: # BB#0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pmulhuw %xmm2, %xmm3
-; SSE41-NEXT: psubw %xmm3, %xmm0
-; SSE41-NEXT: psrlw $1, %xmm0
-; SSE41-NEXT: paddw %xmm3, %xmm0
-; SSE41-NEXT: psrlw $2, %xmm0
-; SSE41-NEXT: pmulhuw %xmm1, %xmm2
-; SSE41-NEXT: psubw %xmm2, %xmm1
-; SSE41-NEXT: psrlw $1, %xmm1
-; SSE41-NEXT: paddw %xmm2, %xmm1
-; SSE41-NEXT: psrlw $2, %xmm1
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test4:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pmulhuw %xmm2, %xmm3
-; SSE-NEXT: psubw %xmm3, %xmm0
-; SSE-NEXT: psrlw $1, %xmm0
-; SSE-NEXT: paddw %xmm3, %xmm0
-; SSE-NEXT: psrlw $2, %xmm0
-; SSE-NEXT: pmulhuw %xmm1, %xmm2
-; SSE-NEXT: psubw %xmm2, %xmm1
-; SSE-NEXT: psrlw $1, %xmm1
-; SSE-NEXT: paddw %xmm2, %xmm1
-; SSE-NEXT: psrlw $2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test4:
-; AVX: # BB#0:
-; AVX-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
-; AVX-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX-NEXT: retq
- %div = udiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7>
- ret <16 x i16> %div
-}
-
-define <8 x i16> @test5(<8 x i16> %a) #0 {
-; SSE41-LABEL: test5:
-; SSE41: # BB#0:
-; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $15, %xmm1
-; SSE41-NEXT: psraw $1, %xmm0
-; SSE41-NEXT: paddw %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test5:
-; SSE: # BB#0:
-; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrlw $15, %xmm1
-; SSE-NEXT: psraw $1, %xmm0
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test5:
-; AVX: # BB#0:
-; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1
-; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
- %div = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
- ret <8 x i16> %div
-}
-
-define <16 x i16> @test6(<16 x i16> %a) #0 {
-; SSE41-LABEL: test6:
-; SSE41: # BB#0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
-; SSE41-NEXT: pmulhw %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrlw $15, %xmm3
-; SSE41-NEXT: psraw $1, %xmm0
-; SSE41-NEXT: paddw %xmm3, %xmm0
-; SSE41-NEXT: pmulhw %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $15, %xmm2
-; SSE41-NEXT: psraw $1, %xmm1
-; SSE41-NEXT: paddw %xmm2, %xmm1
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test6:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
-; SSE-NEXT: pmulhw %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrlw $15, %xmm3
-; SSE-NEXT: psraw $1, %xmm0
-; SSE-NEXT: paddw %xmm3, %xmm0
-; SSE-NEXT: pmulhw %xmm2, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psrlw $15, %xmm2
-; SSE-NEXT: psraw $1, %xmm1
-; SSE-NEXT: paddw %xmm2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test6:
-; AVX: # BB#0:
-; AVX-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm0
-; AVX-NEXT: vpsrlw $15, %ymm0, %ymm1
-; AVX-NEXT: vpsraw $1, %ymm0, %ymm0
-; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX-NEXT: retq
- %div = sdiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7>
- ret <16 x i16> %div
-}
-
-define <16 x i8> @test7(<16 x i8> %a) #0 {
-; SSE41-LABEL: test7:
-; SSE41: # BB#0:
-; SSE41-NEXT: pextrb $1, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pextrb $0, %xmm0, %ecx
-; SSE41-NEXT: movsbl %cl, %ecx
-; SSE41-NEXT: imull $-109, %ecx, %edx
-; SSE41-NEXT: shrl $8, %edx
-; SSE41-NEXT: addb %dl, %cl
-; SSE41-NEXT: movb %cl, %dl
-; SSE41-NEXT: shrb $7, %dl
-; SSE41-NEXT: sarb $2, %cl
-; SSE41-NEXT: addb %dl, %cl
-; SSE41-NEXT: movzbl %cl, %ecx
-; SSE41-NEXT: movd %ecx, %xmm1
-; SSE41-NEXT: pinsrb $1, %eax, %xmm1
-; SSE41-NEXT: pextrb $2, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $2, %eax, %xmm1
-; SSE41-NEXT: pextrb $3, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $3, %eax, %xmm1
-; SSE41-NEXT: pextrb $4, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $4, %eax, %xmm1
-; SSE41-NEXT: pextrb $5, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $5, %eax, %xmm1
-; SSE41-NEXT: pextrb $6, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $6, %eax, %xmm1
-; SSE41-NEXT: pextrb $7, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $7, %eax, %xmm1
-; SSE41-NEXT: pextrb $8, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $8, %eax, %xmm1
-; SSE41-NEXT: pextrb $9, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $9, %eax, %xmm1
-; SSE41-NEXT: pextrb $10, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $10, %eax, %xmm1
-; SSE41-NEXT: pextrb $11, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $11, %eax, %xmm1
-; SSE41-NEXT: pextrb $12, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $12, %eax, %xmm1
-; SSE41-NEXT: pextrb $13, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $13, %eax, %xmm1
-; SSE41-NEXT: pextrb $14, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $14, %eax, %xmm1
-; SSE41-NEXT: pextrb $15, %xmm0, %eax
-; SSE41-NEXT: movsbl %al, %eax
-; SSE41-NEXT: imull $-109, %eax, %ecx
-; SSE41-NEXT: shrl $8, %ecx
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movb %al, %cl
-; SSE41-NEXT: shrb $7, %cl
-; SSE41-NEXT: sarb $2, %al
-; SSE41-NEXT: addb %cl, %al
-; SSE41-NEXT: movzbl %al, %eax
-; SSE41-NEXT: pinsrb $15, %eax, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test7:
-; SSE: # BB#0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: imull $-109, %eax, %ecx
-; SSE-NEXT: shrl $8, %ecx
-; SSE-NEXT: addb %al, %cl
-; SSE-NEXT: movb %cl, %al
-; SSE-NEXT: shrb $7, %al
-; SSE-NEXT: sarb $2, %cl
-; SSE-NEXT: addb %al, %cl
-; SSE-NEXT: movzbl %cl, %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r14d
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %edx
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r9d
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r11d
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %ecx
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r8d
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %esi
-; SSE-NEXT: imull $-109, %esi, %edi
-; SSE-NEXT: shrl $8, %edi
-; SSE-NEXT: addb %sil, %dil
-; SSE-NEXT: movb %dil, %bl
-; SSE-NEXT: shrb $7, %bl
-; SSE-NEXT: sarb $2, %dil
-; SSE-NEXT: addb %bl, %dil
-; SSE-NEXT: movzbl %dil, %esi
-; SSE-NEXT: movd %esi, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: imull $-109, %eax, %esi
-; SSE-NEXT: shrl $8, %esi
-; SSE-NEXT: addb %al, %sil
-; SSE-NEXT: movb %sil, %al
-; SSE-NEXT: shrb $7, %al
-; SSE-NEXT: sarb $2, %sil
-; SSE-NEXT: addb %al, %sil
-; SSE-NEXT: movzbl %sil, %eax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %ebp
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %esi
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %r10d
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %edi
-; SSE-NEXT: imull $-109, %edi, %ebx
-; SSE-NEXT: shrl $8, %ebx
-; SSE-NEXT: addb %dil, %bl
-; SSE-NEXT: movb %bl, %al
-; SSE-NEXT: shrb $7, %al
-; SSE-NEXT: sarb $2, %bl
-; SSE-NEXT: addb %al, %bl
-; SSE-NEXT: movzbl %bl, %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: imull $-109, %edx, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %dl, %al
-; SSE-NEXT: movb %al, %dl
-; SSE-NEXT: shrb $7, %dl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %dl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: imull $-109, %esi, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %sil, %al
-; SSE-NEXT: movb %al, %dl
-; SSE-NEXT: shrb $7, %dl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %dl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE-NEXT: imull $-109, %ecx, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %cl, %al
-; SSE-NEXT: movb %al, %cl
-; SSE-NEXT: shrb $7, %cl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %cl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm3
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %ecx
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: imull $-109, %eax, %edx
-; SSE-NEXT: shrl $8, %edx
-; SSE-NEXT: addb %al, %dl
-; SSE-NEXT: movb %dl, %al
-; SSE-NEXT: shrb $7, %al
-; SSE-NEXT: sarb $2, %dl
-; SSE-NEXT: addb %al, %dl
-; SSE-NEXT: movzbl %dl, %eax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: imull $-109, %r14d, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %r14b, %al
-; SSE-NEXT: movb %al, %dl
-; SSE-NEXT: shrb $7, %dl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %dl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: imull $-109, %ebp, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %bpl, %al
-; SSE-NEXT: movb %al, %dl
-; SSE-NEXT: shrb $7, %dl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %dl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE-NEXT: imull $-109, %r11d, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %r11b, %al
-; SSE-NEXT: movb %al, %dl
-; SSE-NEXT: shrb $7, %dl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %dl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm3
-; SSE-NEXT: imull $-109, %ecx, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %cl, %al
-; SSE-NEXT: movb %al, %cl
-; SSE-NEXT: shrb $7, %cl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %cl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT: imull $-109, %r9d, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %r9b, %al
-; SSE-NEXT: movb %al, %cl
-; SSE-NEXT: shrb $7, %cl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %cl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: imull $-109, %r10d, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %r10b, %al
-; SSE-NEXT: movb %al, %cl
-; SSE-NEXT: shrb $7, %cl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %cl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT: imull $-109, %r8d, %eax
-; SSE-NEXT: shrl $8, %eax
-; SSE-NEXT: addb %r8b, %al
-; SSE-NEXT: movb %al, %cl
-; SSE-NEXT: shrb $7, %cl
-; SSE-NEXT: sarb $2, %al
-; SSE-NEXT: addb %cl, %al
-; SSE-NEXT: movzbl %al, %eax
-; SSE-NEXT: movd %eax, %xmm4
-; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: imull $-109, %eax, %ecx
-; SSE-NEXT: shrl $8, %ecx
-; SSE-NEXT: addb %al, %cl
-; SSE-NEXT: movb %cl, %al
-; SSE-NEXT: shrb $7, %al
-; SSE-NEXT: sarb $2, %cl
-; SSE-NEXT: addb %al, %cl
-; SSE-NEXT: movzbl %cl, %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %rbp
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test7:
-; AVX: # BB#0:
-; AVX-NEXT: vpextrb $1, %xmm0, %eax
-; AVX-NEXT: movsbl %al, %eax
-; AVX-NEXT: imull $-109, %eax, %ecx
-; AVX-NEXT: shrl $8, %ecx
-; AVX-NEXT: addb %cl, %al
-; AVX-NEXT: movb %al, %cl
-; AVX-NEXT: shrb $7, %cl
-; AVX-NEXT: sarb $2, %al
-; AVX-NEXT: addb %cl, %al
-; AVX-NEXT: movzbl %al, %eax
-; AVX-NEXT: vpextrb $0, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %dl
-; AVX-NEXT: shrb $7, %dl
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movzbl %cl, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
-; AVX-NEXT: vpextrb $2, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $3, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $4, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $5, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $6, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $7, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $8, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $9, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $10, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $11, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $12, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $13, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $14, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpextrb $15, %xmm0, %ecx
-; AVX-NEXT: movsbl %cl, %ecx
-; AVX-NEXT: imull $-109, %ecx, %edx
-; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm0
-; AVX-NEXT: shrl $8, %edx
-; AVX-NEXT: addb %dl, %cl
-; AVX-NEXT: movb %cl, %al
-; AVX-NEXT: shrb $7, %al
-; AVX-NEXT: sarb $2, %cl
-; AVX-NEXT: addb %al, %cl
-; AVX-NEXT: movzbl %cl, %eax
-; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX-NEXT: retq
- %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
- ret <16 x i8> %div
-}
-
-define <4 x i32> @test8(<4 x i32> %a) #0 {
-; SSE41-LABEL: test8:
-; SSE41: # BB#0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuldq %xmm2, %xmm3
-; SSE41-NEXT: pmuldq %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; SSE41-NEXT: paddd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $31, %xmm0
-; SSE41-NEXT: psrad $2, %xmm1
-; SSE41-NEXT: paddd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test8:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrad $31, %xmm2
-; SSE-NEXT: pand %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pmuludq %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE-NEXT: psrad $31, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: paddd %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT: psubd %xmm2, %xmm1
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $31, %xmm0
-; SSE-NEXT: psrad $2, %xmm1
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test8:
-; AVX: # BB#0:
-; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpsrld $31, %xmm0, %xmm1
-; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
- %div = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
- ret <4 x i32> %div
-}
-
-define <8 x i32> @test9(<8 x i32> %a) #0 {
-; SSE41-LABEL: test9:
-; SSE41: # BB#0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuldq %xmm4, %xmm5
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pmuldq %xmm3, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
-; SSE41-NEXT: paddd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: psrld $31, %xmm0
-; SSE41-NEXT: psrad $2, %xmm2
-; SSE41-NEXT: paddd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE41-NEXT: pmuldq %xmm4, %xmm0
-; SSE41-NEXT: pmuldq %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5],xmm0[6,7]
-; SSE41-NEXT: paddd %xmm1, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: psrld $31, %xmm0
-; SSE41-NEXT: psrad $2, %xmm3
-; SSE41-NEXT: paddd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm1
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test9:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027]
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psrad $31, %xmm4
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: psrad $31, %xmm5
-; SSE-NEXT: pand %xmm3, %xmm5
-; SSE-NEXT: paddd %xmm0, %xmm5
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pmuludq %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm6, %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSE-NEXT: psubd %xmm5, %xmm0
-; SSE-NEXT: paddd %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrld $31, %xmm2
-; SSE-NEXT: psrad $2, %xmm0
-; SSE-NEXT: paddd %xmm2, %xmm0
-; SSE-NEXT: pand %xmm1, %xmm4
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: psrad $31, %xmm5
-; SSE-NEXT: pand %xmm3, %xmm5
-; SSE-NEXT: paddd %xmm4, %xmm5
-; SSE-NEXT: pmuludq %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm6, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: psubd %xmm5, %xmm2
-; SSE-NEXT: paddd %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: psrld $31, %xmm1
-; SSE-NEXT: psrad $2, %xmm2
-; SSE-NEXT: paddd %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test9:
-; AVX: # BB#0:
-; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
-; AVX-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
-; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vpsrld $31, %ymm0, %ymm1
-; AVX-NEXT: vpsrad $2, %ymm0, %ymm0
-; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: retq
- %div = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
- ret <8 x i32> %div
-}
-
-define <8 x i32> @test10(<8 x i32> %a) #0 {
-; SSE41-LABEL: test10:
-; SSE41: # BB#0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pmuludq %xmm2, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: psubd %xmm5, %xmm4
-; SSE41-NEXT: psrld $1, %xmm4
-; SSE41-NEXT: paddd %xmm5, %xmm4
-; SSE41-NEXT: psrld $2, %xmm4
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7]
-; SSE41-NEXT: pmulld %xmm5, %xmm4
-; SSE41-NEXT: psubd %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE41-NEXT: pmuludq %xmm3, %xmm4
-; SSE41-NEXT: pmuludq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psubd %xmm2, %xmm3
-; SSE41-NEXT: psrld $1, %xmm3
-; SSE41-NEXT: paddd %xmm2, %xmm3
-; SSE41-NEXT: psrld $2, %xmm3
-; SSE41-NEXT: pmulld %xmm5, %xmm3
-; SSE41-NEXT: psubd %xmm3, %xmm1
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test10:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [613566757,613566757,613566757,613566757]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pmuludq %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: psubd %xmm2, %xmm5
-; SSE-NEXT: psrld $1, %xmm5
-; SSE-NEXT: paddd %xmm2, %xmm5
-; SSE-NEXT: psrld $2, %xmm5
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; SSE-NEXT: psubd %xmm5, %xmm0
-; SSE-NEXT: pmuludq %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: psubd %xmm3, %xmm4
-; SSE-NEXT: psrld $1, %xmm4
-; SSE-NEXT: paddd %xmm3, %xmm4
-; SSE-NEXT: psrld $2, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-NEXT: psubd %xmm4, %xmm1
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test10:
-; AVX: # BB#0:
-; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
-; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
-; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm2
-; AVX-NEXT: vpsrld $1, %ymm2, %ymm2
-; AVX-NEXT: vpaddd %ymm1, %ymm2, %ymm1
-; AVX-NEXT: vpsrld $2, %ymm1, %ymm1
-; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
-; AVX-NEXT: vpmulld %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: retq
- %rem = urem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
- ret <8 x i32> %rem
-}
-
-define <8 x i32> @test11(<8 x i32> %a) #0 {
-; SSE41-LABEL: test11:
-; SSE41: # BB#0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuldq %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pmuldq %xmm2, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
-; SSE41-NEXT: paddd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm5, %xmm4
-; SSE41-NEXT: psrld $31, %xmm4
-; SSE41-NEXT: psrad $2, %xmm5
-; SSE41-NEXT: paddd %xmm4, %xmm5
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7]
-; SSE41-NEXT: pmulld %xmm4, %xmm5
-; SSE41-NEXT: psubd %xmm5, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE41-NEXT: pmuldq %xmm3, %xmm5
-; SSE41-NEXT: pmuldq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
-; SSE41-NEXT: paddd %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrld $31, %xmm3
-; SSE41-NEXT: psrad $2, %xmm2
-; SSE41-NEXT: paddd %xmm3, %xmm2
-; SSE41-NEXT: pmulld %xmm4, %xmm2
-; SSE41-NEXT: psubd %xmm2, %xmm1
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test11:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: psrad $31, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: pand %xmm0, %xmm4
-; SSE-NEXT: movdqa %xmm0, %xmm6
-; SSE-NEXT: psrad $31, %xmm6
-; SSE-NEXT: pand %xmm2, %xmm6
-; SSE-NEXT: paddd %xmm4, %xmm6
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: pmuludq %xmm2, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm5, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; SSE-NEXT: psubd %xmm6, %xmm7
-; SSE-NEXT: paddd %xmm0, %xmm7
-; SSE-NEXT: movdqa %xmm7, %xmm4
-; SSE-NEXT: psrld $31, %xmm4
-; SSE-NEXT: psrad $2, %xmm7
-; SSE-NEXT: paddd %xmm4, %xmm7
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; SSE-NEXT: psubd %xmm7, %xmm0
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: movdqa %xmm1, %xmm6
-; SSE-NEXT: psrad $31, %xmm6
-; SSE-NEXT: pand %xmm2, %xmm6
-; SSE-NEXT: paddd %xmm3, %xmm6
-; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm5, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: psubd %xmm6, %xmm2
-; SSE-NEXT: paddd %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: psrld $31, %xmm3
-; SSE-NEXT: psrad $2, %xmm2
-; SSE-NEXT: paddd %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: psubd %xmm2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test11:
-; AVX: # BB#0:
-; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2
-; AVX-NEXT: vpmuldq %ymm1, %ymm0, %ymm1
-; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
-; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm1
-; AVX-NEXT: vpsrld $31, %ymm1, %ymm2
-; AVX-NEXT: vpsrad $2, %ymm1, %ymm1
-; AVX-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
-; AVX-NEXT: vpmulld %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: retq
- %rem = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
- ret <8 x i32> %rem
-}
-
-define <2 x i16> @test12() #0 {
-; SSE41-LABEL: test12:
-; SSE41: # BB#0:
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; SSE-LABEL: test12:
+define <2 x i16> @test_urem_unary_v2i16() nounwind {
+; SSE-LABEL: test_urem_unary_v2i16:
; SSE: # BB#0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test12:
+; AVX-LABEL: test_urem_unary_v2i16:
; AVX: # BB#0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -1216,7 +20,30 @@ define <2 x i16> @test12() #0 {
ret <2 x i16> %B9
}
-define <4 x i32> @PR20355(<4 x i32> %a) #0 {
+define <4 x i32> @PR20355(<4 x i32> %a) nounwind {
+; SSE2-LABEL: PR20355:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: paddd %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE2-NEXT: psubd %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: psrld $31, %xmm0
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
; SSE41-LABEL: PR20355:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
@@ -1231,44 +58,32 @@ define <4 x i32> @PR20355(<4 x i32> %a) #0 {
; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; SSE-LABEL: PR20355:
-; SSE: # BB#0: # %entry
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psrad $31, %xmm2
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrad $31, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: paddd %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSE-NEXT: psubd %xmm3, %xmm4
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: psrld $31, %xmm0
-; SSE-NEXT: paddd %xmm4, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: PR20355:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX-NEXT: vpsrld $31, %xmm0, %xmm1
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: PR20355:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR20355:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpsrld $31, %xmm0, %xmm1
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
entry:
%sdiv = sdiv <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
ret <4 x i32> %sdiv
}
-
-attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/vector-lzcnt-128.ll b/test/CodeGen/X86/vector-lzcnt-128.ll
index 8bf0af68e6dc..06c785575339 100644
--- a/test/CodeGen/X86/vector-lzcnt-128.ll
+++ b/test/CodeGen/X86/vector-lzcnt-128.ll
@@ -1,11 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VLCD --check-prefix=ALL --check-prefix=AVX512
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=AVX512CD --check-prefix=ALL --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VLCD
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD
+;
+; Just one 32-bit run to make sure we do reasonable things for i64 lzcnt.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41
define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
@@ -101,8 +105,40 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
;
; AVX512CD-LABEL: testv2i64:
; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv2i64:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pushl %esi
+; X32-SSE-NEXT: pextrd $2, %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %eax
+; X32-SSE-NEXT: movl $63, %ecx
+; X32-SSE-NEXT: cmovel %ecx, %eax
+; X32-SSE-NEXT: xorl $31, %eax
+; X32-SSE-NEXT: addl $32, %eax
+; X32-SSE-NEXT: pextrd $3, %xmm0, %edx
+; X32-SSE-NEXT: bsrl %edx, %esi
+; X32-SSE-NEXT: xorl $31, %esi
+; X32-SSE-NEXT: testl %edx, %edx
+; X32-SSE-NEXT: cmovel %eax, %esi
+; X32-SSE-NEXT: movd %esi, %xmm1
+; X32-SSE-NEXT: movd %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %eax
+; X32-SSE-NEXT: cmovel %ecx, %eax
+; X32-SSE-NEXT: xorl $31, %eax
+; X32-SSE-NEXT: addl $32, %eax
+; X32-SSE-NEXT: pextrd $1, %xmm0, %ecx
+; X32-SSE-NEXT: bsrl %ecx, %edx
+; X32-SSE-NEXT: xorl $31, %edx
+; X32-SSE-NEXT: testl %ecx, %ecx
+; X32-SSE-NEXT: cmovel %eax, %edx
+; X32-SSE-NEXT: movd %edx, %xmm0
+; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT: popl %esi
+; X32-SSE-NEXT: retl
%out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0)
ret <2 x i64> %out
@@ -187,8 +223,35 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
;
; AVX512CD-LABEL: testv2i64u:
; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv2i64u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pextrd $3, %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %ecx
+; X32-SSE-NEXT: xorl $31, %ecx
+; X32-SSE-NEXT: pextrd $2, %xmm0, %edx
+; X32-SSE-NEXT: bsrl %edx, %edx
+; X32-SSE-NEXT: xorl $31, %edx
+; X32-SSE-NEXT: addl $32, %edx
+; X32-SSE-NEXT: testl %eax, %eax
+; X32-SSE-NEXT: cmovnel %ecx, %edx
+; X32-SSE-NEXT: movd %edx, %xmm1
+; X32-SSE-NEXT: pextrd $1, %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %ecx
+; X32-SSE-NEXT: xorl $31, %ecx
+; X32-SSE-NEXT: movd %xmm0, %edx
+; X32-SSE-NEXT: bsrl %edx, %edx
+; X32-SSE-NEXT: xorl $31, %edx
+; X32-SSE-NEXT: addl $32, %edx
+; X32-SSE-NEXT: testl %eax, %eax
+; X32-SSE-NEXT: cmovnel %ecx, %edx
+; X32-SSE-NEXT: movd %edx, %xmm0
+; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT: retl
%out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 -1)
ret <2 x i64> %out
@@ -349,8 +412,36 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
;
; AVX512CD-LABEL: testv4i32:
; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv4i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pextrd $1, %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %eax
+; X32-SSE-NEXT: movl $63, %ecx
+; X32-SSE-NEXT: cmovel %ecx, %eax
+; X32-SSE-NEXT: xorl $31, %eax
+; X32-SSE-NEXT: movd %xmm0, %edx
+; X32-SSE-NEXT: bsrl %edx, %edx
+; X32-SSE-NEXT: cmovel %ecx, %edx
+; X32-SSE-NEXT: xorl $31, %edx
+; X32-SSE-NEXT: movd %edx, %xmm1
+; X32-SSE-NEXT: pinsrd $1, %eax, %xmm1
+; X32-SSE-NEXT: pextrd $2, %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %eax
+; X32-SSE-NEXT: cmovel %ecx, %eax
+; X32-SSE-NEXT: xorl $31, %eax
+; X32-SSE-NEXT: pinsrd $2, %eax, %xmm1
+; X32-SSE-NEXT: pextrd $3, %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %eax
+; X32-SSE-NEXT: cmovel %ecx, %eax
+; X32-SSE-NEXT: xorl $31, %eax
+; X32-SSE-NEXT: pinsrd $3, %eax, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 0)
ret <4 x i32> %out
@@ -486,8 +577,31 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
;
; AVX512CD-LABEL: testv4i32u:
; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv4i32u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pextrd $1, %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %eax
+; X32-SSE-NEXT: xorl $31, %eax
+; X32-SSE-NEXT: movd %xmm0, %ecx
+; X32-SSE-NEXT: bsrl %ecx, %ecx
+; X32-SSE-NEXT: xorl $31, %ecx
+; X32-SSE-NEXT: movd %ecx, %xmm1
+; X32-SSE-NEXT: pinsrd $1, %eax, %xmm1
+; X32-SSE-NEXT: pextrd $2, %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %eax
+; X32-SSE-NEXT: xorl $31, %eax
+; X32-SSE-NEXT: pinsrd $2, %eax, %xmm1
+; X32-SSE-NEXT: pextrd $3, %xmm0, %eax
+; X32-SSE-NEXT: bsrl %eax, %eax
+; X32-SSE-NEXT: xorl $31, %eax
+; X32-SSE-NEXT: pinsrd $3, %eax, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 -1)
ret <4 x i32> %out
@@ -600,150 +714,75 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
;
; SSSE3-LABEL: testv8i16:
; SSSE3: # BB#0:
-; SSSE3-NEXT: pextrw $7, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %cx
-; SSSE3-NEXT: movw $31, %ax
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: pextrw $3, %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT: pextrw $5, %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: pextrw $1, %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: pextrw $6, %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: pextrw $2, %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSSE3-NEXT: pextrw $4, %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: movd %xmm0, %ecx
-; SSSE3-NEXT: bsrw %cx, %cx
-; SSSE3-NEXT: cmovew %ax, %cx
-; SSSE3-NEXT: xorl $15, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pshufb %xmm1, %xmm4
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pshufb %xmm1, %xmm3
+; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: paddb %xmm3, %xmm1
+; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0
+; SSSE3-NEXT: psrlw $8, %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: psrlw $8, %xmm1
+; SSSE3-NEXT: paddw %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrw $1, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %cx
-; SSE41-NEXT: movw $31, %ax
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: movd %xmm0, %edx
-; SSE41-NEXT: bsrw %dx, %dx
-; SSE41-NEXT: cmovew %ax, %dx
-; SSE41-NEXT: xorl $15, %edx
-; SSE41-NEXT: movd %edx, %xmm1
-; SSE41-NEXT: pinsrw $1, %ecx, %xmm1
-; SSE41-NEXT: pextrw $2, %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: pinsrw $2, %ecx, %xmm1
-; SSE41-NEXT: pextrw $3, %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: pinsrw $3, %ecx, %xmm1
-; SSE41-NEXT: pextrw $4, %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: pinsrw $4, %ecx, %xmm1
-; SSE41-NEXT: pextrw $5, %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: pinsrw $5, %ecx, %xmm1
-; SSE41-NEXT: pextrw $6, %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: pinsrw $6, %ecx, %xmm1
-; SSE41-NEXT: pextrw $7, %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: cmovew %ax, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: pinsrw $7, %ecx, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pshufb %xmm1, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $4, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pshufb %xmm1, %xmm3
+; SSE41-NEXT: pcmpeqb %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: paddb %xmm3, %xmm1
+; SSE41-NEXT: pcmpeqb %xmm2, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv8i16:
; AVX: # BB#0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %cx
-; AVX-NEXT: movw $31, %ax
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vmovd %xmm0, %edx
-; AVX-NEXT: bsrw %dx, %dx
-; AVX-NEXT: cmovew %ax, %dx
-; AVX-NEXT: xorl $15, %edx
-; AVX-NEXT: vmovd %edx, %xmm1
-; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $2, %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $3, %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $4, %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $5, %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $6, %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $7, %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: cmovew %ax, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vpinsrw $7, %ecx, %xmm1, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4
+; AVX-NEXT: vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512VLCD-LABEL: testv8i16:
; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: vpmovzxwd %xmm0, %ymm0
+; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0
; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VLCD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
@@ -756,6 +795,30 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv8i16:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE-NEXT: pand %xmm2, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pshufb %xmm1, %xmm4
+; X32-SSE-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE-NEXT: psrlw $4, %xmm1
+; X32-SSE-NEXT: pand %xmm2, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pshufb %xmm1, %xmm3
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1
+; X32-SSE-NEXT: pand %xmm4, %xmm1
+; X32-SSE-NEXT: paddb %xmm3, %xmm1
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: pand %xmm1, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm1
+; X32-SSE-NEXT: paddw %xmm0, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 0)
ret <8 x i16> %out
}
@@ -849,123 +912,75 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
;
; SSSE3-LABEL: testv8i16u:
; SSSE3: # BB#0:
-; SSSE3-NEXT: pextrw $7, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: pextrw $3, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT: pextrw $5, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: pextrw $1, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: pextrw $6, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: pextrw $2, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSSE3-NEXT: pextrw $4, %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: movd %xmm0, %eax
-; SSSE3-NEXT: bsrw %ax, %ax
-; SSSE3-NEXT: xorl $15, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pshufb %xmm1, %xmm4
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: psrlw $4, %xmm1
+; SSSE3-NEXT: pand %xmm2, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pshufb %xmm1, %xmm3
+; SSSE3-NEXT: pcmpeqb %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: paddb %xmm3, %xmm1
+; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0
+; SSSE3-NEXT: psrlw $8, %xmm0
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: psrlw $8, %xmm1
+; SSSE3-NEXT: paddw %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16u:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrw $1, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: movd %xmm0, %ecx
-; SSE41-NEXT: bsrw %cx, %cx
-; SSE41-NEXT: xorl $15, %ecx
-; SSE41-NEXT: movd %ecx, %xmm1
-; SSE41-NEXT: pinsrw $1, %eax, %xmm1
-; SSE41-NEXT: pextrw $2, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: pinsrw $2, %eax, %xmm1
-; SSE41-NEXT: pextrw $3, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: pinsrw $3, %eax, %xmm1
-; SSE41-NEXT: pextrw $4, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: pinsrw $4, %eax, %xmm1
-; SSE41-NEXT: pextrw $5, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: pinsrw $5, %eax, %xmm1
-; SSE41-NEXT: pextrw $6, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: pinsrw $6, %eax, %xmm1
-; SSE41-NEXT: pextrw $7, %xmm0, %eax
-; SSE41-NEXT: bsrw %ax, %ax
-; SSE41-NEXT: xorl $15, %eax
-; SSE41-NEXT: pinsrw $7, %eax, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pshufb %xmm1, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrlw $4, %xmm1
+; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pshufb %xmm1, %xmm3
+; SSE41-NEXT: pcmpeqb %xmm2, %xmm1
+; SSE41-NEXT: pand %xmm4, %xmm1
+; SSE41-NEXT: paddb %xmm3, %xmm1
+; SSE41-NEXT: pcmpeqb %xmm2, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: psrlw $8, %xmm1
+; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv8i16u:
; AVX: # BB#0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vmovd %xmm0, %ecx
-; AVX-NEXT: bsrw %cx, %cx
-; AVX-NEXT: xorl $15, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
-; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $4, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $5, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $6, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $7, %xmm0, %eax
-; AVX-NEXT: bsrw %ax, %ax
-; AVX-NEXT: xorl $15, %eax
-; AVX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4
+; AVX-NEXT: vpand %xmm1, %xmm4, %xmm1
+; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5
+; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512VLCD-LABEL: testv8i16u:
; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: vpmovzxwd %xmm0, %ymm0
+; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0
; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VLCD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
@@ -978,6 +993,30 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv8i16u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE-NEXT: pand %xmm2, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pshufb %xmm1, %xmm4
+; X32-SSE-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE-NEXT: psrlw $4, %xmm1
+; X32-SSE-NEXT: pand %xmm2, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pshufb %xmm1, %xmm3
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1
+; X32-SSE-NEXT: pand %xmm4, %xmm1
+; X32-SSE-NEXT: paddb %xmm3, %xmm1
+; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: pand %xmm1, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm1
+; X32-SSE-NEXT: paddw %xmm0, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 -1)
ret <8 x i16> %out
}
@@ -1195,295 +1234,80 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
;
; SSSE3-LABEL: testv16i8:
; SSSE3: # BB#0:
-; SSSE3-NEXT: pushq %rbp
-; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: bsrl %eax, %ecx
-; SSSE3-NEXT: movl $15, %eax
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: bsrl %ecx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: bsrl %edx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
-; SSSE3-NEXT: bsrl %ebp, %ebp
-; SSSE3-NEXT: cmovel %eax, %ebp
-; SSSE3-NEXT: xorl $7, %ebp
-; SSSE3-NEXT: movd %ebp, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: bsrl %edi, %edi
-; SSSE3-NEXT: cmovel %eax, %edi
-; SSSE3-NEXT: xorl $7, %edi
-; SSSE3-NEXT: movd %edi, %xmm1
-; SSSE3-NEXT: bsrl %ecx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT: bsrl %esi, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: bsrl %ecx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: bsrl %ebx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: bsrl %edx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT: bsrl %r11d, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: bsrl %esi, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSSE3-NEXT: bsrl %r9d, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: bsrl %r10d, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT: bsrl %r8d, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm4
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: bsrl %ecx, %ecx
-; SSSE3-NEXT: cmovel %eax, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: popq %rbx
-; SSSE3-NEXT: popq %rbp
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm4, %xmm2
+; SSSE3-NEXT: pshufb %xmm0, %xmm1
+; SSSE3-NEXT: paddb %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv16i8:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrb $1, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %ecx
-; SSE41-NEXT: movl $15, %eax
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pextrb $0, %xmm0, %edx
-; SSE41-NEXT: bsrl %edx, %edx
-; SSE41-NEXT: cmovel %eax, %edx
-; SSE41-NEXT: xorl $7, %edx
-; SSE41-NEXT: movd %edx, %xmm1
-; SSE41-NEXT: pinsrb $1, %ecx, %xmm1
-; SSE41-NEXT: pextrb $2, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $2, %ecx, %xmm1
-; SSE41-NEXT: pextrb $3, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $3, %ecx, %xmm1
-; SSE41-NEXT: pextrb $4, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $4, %ecx, %xmm1
-; SSE41-NEXT: pextrb $5, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $5, %ecx, %xmm1
-; SSE41-NEXT: pextrb $6, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $6, %ecx, %xmm1
-; SSE41-NEXT: pextrb $7, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $7, %ecx, %xmm1
-; SSE41-NEXT: pextrb $8, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $8, %ecx, %xmm1
-; SSE41-NEXT: pextrb $9, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $9, %ecx, %xmm1
-; SSE41-NEXT: pextrb $10, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $10, %ecx, %xmm1
-; SSE41-NEXT: pextrb $11, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $11, %ecx, %xmm1
-; SSE41-NEXT: pextrb $12, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $12, %ecx, %xmm1
-; SSE41-NEXT: pextrb $13, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $13, %ecx, %xmm1
-; SSE41-NEXT: pextrb $14, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $14, %ecx, %xmm1
-; SSE41-NEXT: pextrb $15, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: cmovel %eax, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: pinsrb $15, %ecx, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: pshufb %xmm3, %xmm4
+; SSE41-NEXT: psrlw $4, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pcmpeqb %xmm0, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: pshufb %xmm0, %xmm1
+; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv16i8:
; AVX: # BB#0:
-; AVX-NEXT: vpextrb $1, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %ecx
-; AVX-NEXT: movl $15, %eax
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpextrb $0, %xmm0, %edx
-; AVX-NEXT: bsrl %edx, %edx
-; AVX-NEXT: cmovel %eax, %edx
-; AVX-NEXT: xorl $7, %edx
-; AVX-NEXT: vmovd %edx, %xmm1
-; AVX-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $2, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $3, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $4, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $5, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $6, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $7, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $8, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $9, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $10, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $11, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $12, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $13, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $14, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $15, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: cmovel %eax, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpand %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
-; AVX512VLCD-LABEL: testv16i8:
-; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: vpmovzxbd %xmm0, %zmm0
-; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512VLCD-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VLCD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VLCD-NEXT: retq
-;
-; AVX512CD-LABEL: testv16i8:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: vpmovzxbd %xmm0, %zmm0
-; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512CD-NEXT: retq
+; AVX512-LABEL: testv16i8:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
+;
+; X32-SSE-LABEL: testv16i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm3
+; X32-SSE-NEXT: pand %xmm2, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm4
+; X32-SSE-NEXT: pshufb %xmm3, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pcmpeqb %xmm0, %xmm2
+; X32-SSE-NEXT: pand %xmm4, %xmm2
+; X32-SSE-NEXT: pshufb %xmm0, %xmm1
+; X32-SSE-NEXT: paddb %xmm2, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 0)
ret <16 x i8> %out
}
@@ -1663,242 +1487,80 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
;
; SSSE3-LABEL: testv16i8u:
; SSSE3: # BB#0:
-; SSSE3-NEXT: pushq %rbx
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: bsrl %eax, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSSE3-NEXT: bsrl %esi, %esi
-; SSSE3-NEXT: xorl $7, %esi
-; SSSE3-NEXT: movd %esi, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: bsrl %eax, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
-; SSSE3-NEXT: bsrl %ebx, %ebx
-; SSSE3-NEXT: xorl $7, %ebx
-; SSSE3-NEXT: movd %ebx, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT: bsrl %edx, %edx
-; SSSE3-NEXT: xorl $7, %edx
-; SSSE3-NEXT: movd %edx, %xmm0
-; SSSE3-NEXT: bsrl %esi, %edx
-; SSSE3-NEXT: xorl $7, %edx
-; SSSE3-NEXT: movd %edx, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT: bsrl %ecx, %ecx
-; SSSE3-NEXT: xorl $7, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT: bsrl %edx, %edx
-; SSSE3-NEXT: xorl $7, %edx
-; SSSE3-NEXT: movd %edx, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: bsrl %edi, %edx
-; SSSE3-NEXT: xorl $7, %edx
-; SSSE3-NEXT: movd %edx, %xmm0
-; SSSE3-NEXT: bsrl %eax, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: bsrl %r10d, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: bsrl %ecx, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSSE3-NEXT: bsrl %r9d, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: bsrl %r11d, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: bsrl %r8d, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm4
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: bsrl %eax, %eax
-; SSSE3-NEXT: xorl $7, %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: popq %rbx
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pshufb %xmm3, %xmm4
+; SSSE3-NEXT: psrlw $4, %xmm0
+; SSSE3-NEXT: pand %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm4, %xmm2
+; SSSE3-NEXT: pshufb %xmm0, %xmm1
+; SSSE3-NEXT: paddb %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv16i8u:
; SSE41: # BB#0:
-; SSE41-NEXT: pextrb $1, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pextrb $0, %xmm0, %ecx
-; SSE41-NEXT: bsrl %ecx, %ecx
-; SSE41-NEXT: xorl $7, %ecx
-; SSE41-NEXT: movd %ecx, %xmm1
-; SSE41-NEXT: pinsrb $1, %eax, %xmm1
-; SSE41-NEXT: pextrb $2, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $2, %eax, %xmm1
-; SSE41-NEXT: pextrb $3, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $3, %eax, %xmm1
-; SSE41-NEXT: pextrb $4, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $4, %eax, %xmm1
-; SSE41-NEXT: pextrb $5, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $5, %eax, %xmm1
-; SSE41-NEXT: pextrb $6, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $6, %eax, %xmm1
-; SSE41-NEXT: pextrb $7, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $7, %eax, %xmm1
-; SSE41-NEXT: pextrb $8, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $8, %eax, %xmm1
-; SSE41-NEXT: pextrb $9, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $9, %eax, %xmm1
-; SSE41-NEXT: pextrb $10, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $10, %eax, %xmm1
-; SSE41-NEXT: pextrb $11, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $11, %eax, %xmm1
-; SSE41-NEXT: pextrb $12, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $12, %eax, %xmm1
-; SSE41-NEXT: pextrb $13, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $13, %eax, %xmm1
-; SSE41-NEXT: pextrb $14, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $14, %eax, %xmm1
-; SSE41-NEXT: pextrb $15, %xmm0, %eax
-; SSE41-NEXT: bsrl %eax, %eax
-; SSE41-NEXT: xorl $7, %eax
-; SSE41-NEXT: pinsrb $15, %eax, %xmm1
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pand %xmm2, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: pshufb %xmm3, %xmm4
+; SSE41-NEXT: psrlw $4, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pcmpeqb %xmm0, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm2
+; SSE41-NEXT: pshufb %xmm0, %xmm1
+; SSE41-NEXT: paddb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv16i8u:
; AVX: # BB#0:
-; AVX-NEXT: vpextrb $1, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpextrb $0, %xmm0, %ecx
-; AVX-NEXT: bsrl %ecx, %ecx
-; AVX-NEXT: xorl $7, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
-; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $2, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $3, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $4, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $5, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $6, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $7, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $8, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $9, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $10, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $11, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $12, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $13, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $14, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrb $15, %xmm0, %eax
-; AVX-NEXT: bsrl %eax, %eax
-; AVX-NEXT: xorl $7, %eax
-; AVX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpand %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
-; AVX512VLCD-LABEL: testv16i8u:
-; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: vpmovzxbd %xmm0, %zmm0
-; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512VLCD-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VLCD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VLCD-NEXT: retq
-;
-; AVX512CD-LABEL: testv16i8u:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: vpmovzxbd %xmm0, %zmm0
-; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512CD-NEXT: retq
+; AVX512-LABEL: testv16i8u:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
+;
+; X32-SSE-LABEL: testv16i8u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm3
+; X32-SSE-NEXT: pand %xmm2, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm4
+; X32-SSE-NEXT: pshufb %xmm3, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm0
+; X32-SSE-NEXT: pand %xmm2, %xmm0
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: pcmpeqb %xmm0, %xmm2
+; X32-SSE-NEXT: pand %xmm4, %xmm2
+; X32-SSE-NEXT: pshufb %xmm0, %xmm1
+; X32-SSE-NEXT: paddb %xmm2, %xmm1
+; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 -1)
ret <16 x i8> %out
}
@@ -1916,17 +1578,17 @@ define <2 x i64> @foldv2i64() nounwind {
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: retq
;
-; AVX512VLCD-LABEL: foldv2i64:
-; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: movl $55, %eax
-; AVX512VLCD-NEXT: vmovq %rax, %xmm0
-; AVX512VLCD-NEXT: retq
-;
-; AVX512CD-LABEL: foldv2i64:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: movl $55, %eax
-; AVX512CD-NEXT: vmovq %rax, %xmm0
-; AVX512CD-NEXT: retq
+; AVX512-LABEL: foldv2i64:
+; AVX512: ## BB#0:
+; AVX512-NEXT: movl $55, %eax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: retq
+;
+; X32-SSE-LABEL: foldv2i64:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl $55, %eax
+; X32-SSE-NEXT: movd %eax, %xmm0
+; X32-SSE-NEXT: retl
%out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0)
ret <2 x i64> %out
}
@@ -1944,17 +1606,17 @@ define <2 x i64> @foldv2i64u() nounwind {
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: retq
;
-; AVX512VLCD-LABEL: foldv2i64u:
-; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: movl $55, %eax
-; AVX512VLCD-NEXT: vmovq %rax, %xmm0
-; AVX512VLCD-NEXT: retq
-;
-; AVX512CD-LABEL: foldv2i64u:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: movl $55, %eax
-; AVX512CD-NEXT: vmovq %rax, %xmm0
-; AVX512CD-NEXT: retq
+; AVX512-LABEL: foldv2i64u:
+; AVX512: ## BB#0:
+; AVX512-NEXT: movl $55, %eax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: retq
+;
+; X32-SSE-LABEL: foldv2i64u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl $55, %eax
+; X32-SSE-NEXT: movd %eax, %xmm0
+; X32-SSE-NEXT: retl
%out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1)
ret <2 x i64> %out
}
@@ -1979,6 +1641,11 @@ define <4 x i32> @foldv4i32() nounwind {
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24]
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv4i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24]
+; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 0)
ret <4 x i32> %out
}
@@ -2003,6 +1670,11 @@ define <4 x i32> @foldv4i32u() nounwind {
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24]
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv4i32u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24]
+; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 -1)
ret <4 x i32> %out
}
@@ -2027,6 +1699,11 @@ define <8 x i16> @foldv8i16() nounwind {
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv8i16:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
+; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 0)
ret <8 x i16> %out
}
@@ -2051,6 +1728,11 @@ define <8 x i16> @foldv8i16u() nounwind {
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv8i16u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
+; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 -1)
ret <8 x i16> %out
}
@@ -2075,6 +1757,11 @@ define <16 x i8> @foldv16i8() nounwind {
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv16i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
+; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 0)
ret <16 x i8> %out
}
@@ -2099,6 +1786,11 @@ define <16 x i8> @foldv16i8u() nounwind {
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv16i8u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
+; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 -1)
ret <16 x i8> %out
}
diff --git a/test/CodeGen/X86/vector-lzcnt-256.ll b/test/CodeGen/X86/vector-lzcnt-256.ll
index 1608bf53748d..ed31e49cb07c 100644
--- a/test/CodeGen/X86/vector-lzcnt-256.ll
+++ b/test/CodeGen/X86/vector-lzcnt-256.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl| FileCheck %s --check-prefix=AVX512VLCD --check-prefix=ALL --check-prefix=AVX512
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=AVX512CD --check-prefix=ALL --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VLCD
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD
define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
@@ -12,55 +12,55 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-NEXT: bsrq %rax, %rax
; AVX1-NEXT: movl $127, %ecx
; AVX1-NEXT: cmoveq %rcx, %rax
-; AVX1-NEXT: xorq $63, %rax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: bsrq %rax, %rax
; AVX1-NEXT: cmoveq %rcx, %rax
-; AVX1-NEXT: xorq $63, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: bsrq %rax, %rax
; AVX1-NEXT: cmoveq %rcx, %rax
-; AVX1-NEXT: xorq $63, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vmovq %rax, %xmm3
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: bsrq %rax, %rax
; AVX1-NEXT: cmoveq %rcx, %rax
-; AVX1-NEXT: xorq $63, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i64:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: movl $127, %ecx
-; AVX2-NEXT: cmoveq %rcx, %rax
-; AVX2-NEXT: xorq $63, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovq %xmm1, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: cmoveq %rcx, %rax
-; AVX2-NEXT: xorq $63, %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: cmoveq %rcx, %rax
-; AVX2-NEXT: xorq $63, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: cmoveq %rcx, %rax
-; AVX2-NEXT: xorq $63, %rax
-; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2
+; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv4i64:
@@ -70,7 +70,9 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
;
; AVX512CD-LABEL: testv4i64:
; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
%out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %in, i1 0)
@@ -83,47 +85,52 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: bsrq %rax, %rax
-; AVX1-NEXT: xorq $63, %rax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: bsrq %rax, %rax
-; AVX1-NEXT: xorq $63, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63]
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: bsrq %rax, %rax
-; AVX1-NEXT: xorq $63, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vmovq %rax, %xmm3
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: bsrq %rax, %rax
-; AVX1-NEXT: xorq $63, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i64u:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: xorq $63, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovq %xmm1, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: xorq $63, %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: xorq $63, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: bsrq %rax, %rax
-; AVX2-NEXT: xorq $63, %rax
-; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2
+; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv4i64u:
@@ -133,7 +140,9 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
;
; AVX512CD-LABEL: testv4i64u:
; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
%out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %in, i1 -1)
  ret <4 x i64> %out
@@ -148,91 +157,64 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-NEXT: bsrl %eax, %ecx
; AVX1-NEXT: movl $63, %eax
; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $31, %ecx
; AVX1-NEXT: vmovd %xmm1, %edx
; AVX1-NEXT: bsrl %edx, %edx
; AVX1-NEXT: cmovel %eax, %edx
-; AVX1-NEXT: xorl $31, %edx
; AVX1-NEXT: vmovd %edx, %xmm2
; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $31, %ecx
; AVX1-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $31, %ecx
; AVX1-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31]
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpextrd $1, %xmm0, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $31, %ecx
; AVX1-NEXT: vmovd %xmm0, %edx
; AVX1-NEXT: bsrl %edx, %edx
; AVX1-NEXT: cmovel %eax, %edx
-; AVX1-NEXT: xorl $31, %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
-; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
+; AVX1-NEXT: vmovd %edx, %xmm3
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm3, %xmm3
; AVX1-NEXT: vpextrd $2, %xmm0, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $31, %ecx
-; AVX1-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2
+; AVX1-NEXT: vpinsrd $2, %ecx, %xmm3, %xmm3
; AVX1-NEXT: vpextrd $3, %xmm0, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $31, %ecx
-; AVX1-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0
+; AVX1-NEXT: vpinsrd $3, %ecx, %xmm3, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrd $1, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %ecx
-; AVX2-NEXT: movl $63, %eax
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $31, %ecx
-; AVX2-NEXT: vmovd %xmm1, %edx
-; AVX2-NEXT: bsrl %edx, %edx
-; AVX2-NEXT: cmovel %eax, %edx
-; AVX2-NEXT: xorl $31, %edx
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $31, %ecx
-; AVX2-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $31, %ecx
-; AVX2-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm1
-; AVX2-NEXT: vpextrd $1, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $31, %ecx
-; AVX2-NEXT: vmovd %xmm0, %edx
-; AVX2-NEXT: bsrl %edx, %edx
-; AVX2-NEXT: cmovel %eax, %edx
-; AVX2-NEXT: xorl $31, %edx
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $2, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $31, %ecx
-; AVX2-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $3, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $31, %ecx
-; AVX2-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv8i32:
@@ -242,7 +224,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
;
; AVX512CD-LABEL: testv8i32:
; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
%out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %in, i1 0)
  ret <8 x i32> %out
@@ -255,75 +239,57 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrd $1, %xmm1, %eax
; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $31, %eax
; AVX1-NEXT: vmovd %xmm1, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: xorl $31, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm2
; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $2, %xmm1, %eax
; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $31, %eax
; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
; AVX1-NEXT: vpextrd $3, %xmm1, %eax
; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $31, %eax
; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31]
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpextrd $1, %xmm0, %eax
; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $31, %eax
; AVX1-NEXT: vmovd %xmm0, %ecx
; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: xorl $31, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; AVX1-NEXT: vmovd %ecx, %xmm3
+; AVX1-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
; AVX1-NEXT: vpextrd $2, %xmm0, %eax
; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $31, %eax
-; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
+; AVX1-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
; AVX1-NEXT: vpextrd $3, %xmm0, %eax
; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $31, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i32u:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrd $1, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $31, %eax
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: xorl $31, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $2, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $31, %eax
-; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $3, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $31, %eax
-; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
-; AVX2-NEXT: vpextrd $1, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $31, %eax
-; AVX2-NEXT: vmovd %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: xorl $31, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $2, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $31, %eax
-; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $3, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $31, %eax
-; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv8i32u:
@@ -333,7 +299,9 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
;
; AVX512CD-LABEL: testv8i32u:
; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512CD-NEXT: retq
%out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %in, i1 -1)
  ret <8 x i32> %out
@@ -344,192 +312,65 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrw $1, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %cx
-; AVX1-NEXT: movw $31, %ax
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vmovd %xmm1, %edx
-; AVX1-NEXT: bsrw %dx, %dx
-; AVX1-NEXT: cmovew %ax, %dx
-; AVX1-NEXT: xorl $15, %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
-; AVX1-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $2, %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $3, %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $4, %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $5, %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $6, %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $7, %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm1
-; AVX1-NEXT: vpextrw $1, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: bsrw %dx, %dx
-; AVX1-NEXT: cmovew %ax, %dx
-; AVX1-NEXT: xorl $15, %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
-; AVX1-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $2, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $3, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $4, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $5, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $6, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $7, %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: cmovew %ax, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm0
+; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5
+; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm5, %xmm7
+; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
+; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm5
+; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i16:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrw $1, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %cx
-; AVX2-NEXT: movw $31, %ax
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vmovd %xmm1, %edx
-; AVX2-NEXT: bsrw %dx, %dx
-; AVX2-NEXT: cmovew %ax, %dx
-; AVX2-NEXT: xorl $15, %edx
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $2, %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $3, %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $4, %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $5, %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $6, %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $7, %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm1
-; AVX2-NEXT: vpextrw $1, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vmovd %xmm0, %edx
-; AVX2-NEXT: bsrw %dx, %dx
-; AVX2-NEXT: cmovew %ax, %dx
-; AVX2-NEXT: xorl $15, %edx
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $2, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $3, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $4, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $5, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $6, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $7, %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: cmovew %ax, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
-; AVX512VLCD-LABEL: testv16i16:
-; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: vpmovzxwd %ymm0, %zmm0
-; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512VLCD-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VLCD-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512VLCD-NEXT: retq
-;
-; AVX512CD-LABEL: testv16i16:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: vpmovzxwd %ymm0, %zmm0
-; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512CD-NEXT: retq
+; AVX512-LABEL: testv16i16:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: retq
%out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %in, i1 0)
ret <16 x i16> %out
}
@@ -538,158 +379,65 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrw $1, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $2, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $3, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $4, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $5, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $6, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $7, %xmm1, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1
-; AVX1-NEXT: vpextrw $1, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vmovd %xmm0, %ecx
-; AVX1-NEXT: bsrw %cx, %cx
-; AVX1-NEXT: xorl $15, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $2, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $3, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $4, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $5, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $6, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrw $7, %xmm0, %eax
-; AVX1-NEXT: bsrw %ax, %ax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0
+; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5
+; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm5, %xmm7
+; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
+; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm5
+; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i16u:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrw $1, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $2, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $3, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $4, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $5, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $6, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $7, %xmm1, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1
-; AVX2-NEXT: vpextrw $1, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vmovd %xmm0, %ecx
-; AVX2-NEXT: bsrw %cx, %cx
-; AVX2-NEXT: xorl $15, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $2, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $3, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $4, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $5, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $6, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrw $7, %xmm0, %eax
-; AVX2-NEXT: bsrw %ax, %ax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
-; AVX512VLCD-LABEL: testv16i16u:
-; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: vpmovzxwd %ymm0, %zmm0
-; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512VLCD-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VLCD-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512VLCD-NEXT: retq
-;
-; AVX512CD-LABEL: testv16i16u:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: vpmovzxwd %ymm0, %zmm0
-; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512CD-NEXT: retq
+; AVX512-LABEL: testv16i16u:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: retq
%out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %in, i1 -1)
ret <16 x i16> %out
}
@@ -698,346 +446,52 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrb $1, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %ecx
-; AVX1-NEXT: movl $15, %eax
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpextrb $0, %xmm1, %edx
-; AVX1-NEXT: bsrl %edx, %edx
-; AVX1-NEXT: cmovel %eax, %edx
-; AVX1-NEXT: xorl $7, %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
-; AVX1-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $2, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1
-; AVX1-NEXT: vpextrb $1, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpextrb $0, %xmm0, %edx
-; AVX1-NEXT: bsrl %edx, %edx
-; AVX1-NEXT: cmovel %eax, %edx
-; AVX1-NEXT: xorl $7, %edx
-; AVX1-NEXT: vmovd %edx, %xmm2
-; AVX1-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $2, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $3, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $4, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $5, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $6, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $7, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $8, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $9, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $10, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $11, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $12, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $13, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $14, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $15, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: cmovel %eax, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm0
+; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm2
+; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv32i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrb $1, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %ecx
-; AVX2-NEXT: movl $15, %eax
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpextrb $0, %xmm1, %edx
-; AVX2-NEXT: bsrl %edx, %edx
-; AVX2-NEXT: cmovel %eax, %edx
-; AVX2-NEXT: xorl $7, %edx
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $2, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $4, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $5, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $7, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $8, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $10, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $11, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $13, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $14, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1
-; AVX2-NEXT: vpextrb $1, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpextrb $0, %xmm0, %edx
-; AVX2-NEXT: bsrl %edx, %edx
-; AVX2-NEXT: cmovel %eax, %edx
-; AVX2-NEXT: xorl $7, %edx
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $2, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $3, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $4, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $5, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $6, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $7, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $8, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $9, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $10, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $11, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $12, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $13, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $14, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $15, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: cmovel %eax, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv32i8:
; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VLCD-NEXT: vpmovzxbd %xmm1, %zmm1
+; AVX512VLCD-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VLCD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512VLCD-NEXT: vpmovdb %zmm1, %xmm1
; AVX512VLCD-NEXT: vmovdqa64 {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
-; AVX512VLCD-NEXT: vpmovzxbd %xmm0, %zmm0
+; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512VLCD-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
@@ -1047,12 +501,12 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX512CD-LABEL: testv32i8:
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512CD-NEXT: vpmovzxbd %xmm1, %zmm1
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512CD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
-; AVX512CD-NEXT: vpmovzxbd %xmm0, %zmm0
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
; AVX512CD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
@@ -1066,280 +520,52 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrb $1, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $2, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $3, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $4, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $5, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $6, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $7, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $8, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $9, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $10, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $11, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $12, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $13, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $14, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $15, %xmm1, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX1-NEXT: vpextrb $1, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpextrb $0, %xmm0, %ecx
-; AVX1-NEXT: bsrl %ecx, %ecx
-; AVX1-NEXT: xorl $7, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $2, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $3, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $4, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $5, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $6, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $7, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $8, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $9, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $10, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $11, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $12, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $13, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $14, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX1-NEXT: vpextrb $15, %xmm0, %eax
-; AVX1-NEXT: bsrl %eax, %eax
-; AVX1-NEXT: xorl $7, %eax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0
+; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm2
+; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv32i8u:
; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrb $1, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $2, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $3, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $4, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $5, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $6, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $7, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $8, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $9, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $10, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $11, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $12, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $13, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $14, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $15, %xmm1, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX2-NEXT: vpextrb $1, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpextrb $0, %xmm0, %ecx
-; AVX2-NEXT: bsrl %ecx, %ecx
-; AVX2-NEXT: xorl $7, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $2, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $3, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $4, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $5, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $6, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $7, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $8, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $9, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $10, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $11, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $12, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $13, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $14, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX2-NEXT: vpextrb $15, %xmm0, %eax
-; AVX2-NEXT: bsrl %eax, %eax
-; AVX2-NEXT: xorl $7, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VLCD-LABEL: testv32i8u:
; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VLCD-NEXT: vpmovzxbd %xmm1, %zmm1
+; AVX512VLCD-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VLCD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512VLCD-NEXT: vpmovdb %zmm1, %xmm1
; AVX512VLCD-NEXT: vmovdqa64 {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
-; AVX512VLCD-NEXT: vpmovzxbd %xmm0, %zmm0
+; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512VLCD-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
@@ -1349,12 +575,12 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX512CD-LABEL: testv32i8u:
; AVX512CD: ## BB#0:
; AVX512CD-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512CD-NEXT: vpmovzxbd %xmm1, %zmm1
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1
; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512CD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
-; AVX512CD-NEXT: vpmovzxbd %xmm0, %zmm0
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
; AVX512CD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vector-lzcnt-512.ll b/test/CodeGen/X86/vector-lzcnt-512.ll
index 20ea86e5d439..4014cfd7ba2c 100644
--- a/test/CodeGen/X86/vector-lzcnt-512.ll
+++ b/test/CodeGen/X86/vector-lzcnt-512.ll
@@ -1,5 +1,6 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; ALL-LABEL: testv8i64:
@@ -38,28 +39,28 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
}
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
-; ALL-LABEL: testv32i16:
-; ALL: ## BB#0:
-; ALL-NEXT: vpmovzxwd %ymm0, %zmm0
-; ALL-NEXT: vplzcntd %zmm0, %zmm0
-; ALL-NEXT: vpmovdw %zmm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; ALL-NEXT: vpsubw %ymm2, %ymm0, %ymm0
-; ALL-NEXT: vpmovzxwd %ymm1, %zmm1
-; ALL-NEXT: vplzcntd %zmm1, %zmm1
-; ALL-NEXT: vpmovdw %zmm1, %ymm1
-; ALL-NEXT: vpsubw %ymm2, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv32i16:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512CD-NEXT: vpsubw %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
+; AVX512CD-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512CD-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: retq
;
; AVX512BW-LABEL: testv32i16:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpmovzxwd %ymm1, %zmm1
+; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512BW-NEXT: vplzcntd %zmm1, %zmm1
; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %ymm2, %ymm1, %ymm1
-; AVX512BW-NEXT: vpmovzxwd %ymm0, %zmm0
+; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512BW-NEXT: vplzcntd %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpsubw %ymm2, %ymm0, %ymm0
@@ -70,28 +71,28 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
}
define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
-; ALL-LABEL: testv32i16u:
-; ALL: ## BB#0:
-; ALL-NEXT: vpmovzxwd %ymm0, %zmm0
-; ALL-NEXT: vplzcntd %zmm0, %zmm0
-; ALL-NEXT: vpmovdw %zmm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; ALL-NEXT: vpsubw %ymm2, %ymm0, %ymm0
-; ALL-NEXT: vpmovzxwd %ymm1, %zmm1
-; ALL-NEXT: vplzcntd %zmm1, %zmm1
-; ALL-NEXT: vpmovdw %zmm1, %ymm1
-; ALL-NEXT: vpsubw %ymm2, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv32i16u:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512CD-NEXT: vpsubw %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
+; AVX512CD-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512CD-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: retq
;
; AVX512BW-LABEL: testv32i16u:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpmovzxwd %ymm1, %zmm1
+; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512BW-NEXT: vplzcntd %zmm1, %zmm1
; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %ymm2, %ymm1, %ymm1
-; AVX512BW-NEXT: vpmovzxwd %ymm0, %zmm0
+; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512BW-NEXT: vplzcntd %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpsubw %ymm2, %ymm0, %ymm0
@@ -102,51 +103,51 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
}
define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
-; ALL-LABEL: testv64i8:
-; ALL: ## BB#0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2
-; ALL-NEXT: vpmovzxbd %xmm2, %zmm2
-; ALL-NEXT: vplzcntd %zmm2, %zmm2
-; ALL-NEXT: vpmovdb %zmm2, %xmm2
-; ALL-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
-; ALL-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; ALL-NEXT: vpmovzxbd %xmm0, %zmm0
-; ALL-NEXT: vplzcntd %zmm0, %zmm0
-; ALL-NEXT: vpmovdb %zmm0, %xmm0
-; ALL-NEXT: vpsubb %xmm3, %xmm0, %xmm0
-; ALL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; ALL-NEXT: vextractf128 $1, %ymm1, %xmm2
-; ALL-NEXT: vpmovzxbd %xmm2, %zmm2
-; ALL-NEXT: vplzcntd %zmm2, %zmm2
-; ALL-NEXT: vpmovdb %zmm2, %xmm2
-; ALL-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; ALL-NEXT: vpmovzxbd %xmm1, %zmm1
-; ALL-NEXT: vplzcntd %zmm1, %zmm1
-; ALL-NEXT: vpmovdb %zmm1, %xmm1
-; ALL-NEXT: vpsubb %xmm3, %xmm1, %xmm1
-; ALL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv64i8:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2
+; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm0, %xmm0
+; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2
+; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
+; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm1, %xmm1
+; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512CD-NEXT: retq
;
; AVX512BW-LABEL: testv64i8:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vpmovzxbd %xmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512BW-NEXT: vplzcntd %zmm2, %zmm2
; AVX512BW-NEXT: vpmovdb %zmm2, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512BW-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbd %xmm1, %zmm1
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512BW-NEXT: vplzcntd %zmm1, %zmm1
; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1
; AVX512BW-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT: vpmovzxbd %xmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512BW-NEXT: vplzcntd %zmm2, %zmm2
; AVX512BW-NEXT: vpmovdb %zmm2, %xmm2
; AVX512BW-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbd %xmm0, %zmm0
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-NEXT: vplzcntd %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpsubb %xmm3, %xmm0, %xmm0
@@ -158,51 +159,51 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
}
define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
-; ALL-LABEL: testv64i8u:
-; ALL: ## BB#0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm2
-; ALL-NEXT: vpmovzxbd %xmm2, %zmm2
-; ALL-NEXT: vplzcntd %zmm2, %zmm2
-; ALL-NEXT: vpmovdb %zmm2, %xmm2
-; ALL-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
-; ALL-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; ALL-NEXT: vpmovzxbd %xmm0, %zmm0
-; ALL-NEXT: vplzcntd %zmm0, %zmm0
-; ALL-NEXT: vpmovdb %zmm0, %xmm0
-; ALL-NEXT: vpsubb %xmm3, %xmm0, %xmm0
-; ALL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; ALL-NEXT: vextractf128 $1, %ymm1, %xmm2
-; ALL-NEXT: vpmovzxbd %xmm2, %zmm2
-; ALL-NEXT: vplzcntd %zmm2, %zmm2
-; ALL-NEXT: vpmovdb %zmm2, %xmm2
-; ALL-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; ALL-NEXT: vpmovzxbd %xmm1, %zmm1
-; ALL-NEXT: vplzcntd %zmm1, %zmm1
-; ALL-NEXT: vpmovdb %zmm1, %xmm1
-; ALL-NEXT: vpsubb %xmm3, %xmm1, %xmm1
-; ALL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv64i8u:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2
+; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm0, %xmm0
+; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2
+; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
+; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm1, %xmm1
+; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512CD-NEXT: retq
;
; AVX512BW-LABEL: testv64i8u:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vpmovzxbd %xmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512BW-NEXT: vplzcntd %zmm2, %zmm2
; AVX512BW-NEXT: vpmovdb %zmm2, %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512BW-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbd %xmm1, %zmm1
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512BW-NEXT: vplzcntd %zmm1, %zmm1
; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1
; AVX512BW-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT: vpmovzxbd %xmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512BW-NEXT: vplzcntd %zmm2, %zmm2
; AVX512BW-NEXT: vpmovdb %zmm2, %xmm2
; AVX512BW-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbd %xmm0, %zmm0
+; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-NEXT: vplzcntd %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpsubb %xmm3, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vector-pcmp.ll b/test/CodeGen/X86/vector-pcmp.ll
new file mode 100644
index 000000000000..0718edf5a143
--- /dev/null
+++ b/test/CodeGen/X86/vector-pcmp.ll
@@ -0,0 +1,495 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+; Lower common integer comparisons such as 'isPositive' efficiently:
+; https://llvm.org/bugs/show_bug.cgi?id=26701
+
+define <16 x i8> @test_pcmpgtb(<16 x i8> %x) {
+; SSE-LABEL: test_pcmpgtb:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: pcmpgtb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_pcmpgtb:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %sign = ashr <16 x i8> %x, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ %not = xor <16 x i8> %sign, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ ret <16 x i8> %not
+}
+
+define <8 x i16> @test_pcmpgtw(<8 x i16> %x) {
+; SSE-LABEL: test_pcmpgtw:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_pcmpgtw:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %sign = ashr <8 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+ %not = xor <8 x i16> %sign, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ ret <8 x i16> %not
+}
+
+define <4 x i32> @test_pcmpgtd(<4 x i32> %x) {
+; SSE-LABEL: test_pcmpgtd:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_pcmpgtd:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %sign = ashr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
+ %not = xor <4 x i32> %sign, <i32 -1, i32 -1, i32 -1, i32 -1>
+ ret <4 x i32> %not
+}
+
+define <2 x i64> @test_pcmpgtq(<2 x i64> %x) {
+; SSE2-LABEL: test_pcmpgtq:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_pcmpgtq:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: test_pcmpgtq:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %sign = ashr <2 x i64> %x, <i64 63, i64 63>
+ %not = xor <2 x i64> %sign, <i64 -1, i64 -1>
+ ret <2 x i64> %not
+}
+
+define <1 x i128> @test_strange_type(<1 x i128> %x) {
+; SSE2-LABEL: test_strange_type:
+; SSE2: # BB#0:
+; SSE2-NEXT: sarq $63, %rsi
+; SSE2-NEXT: movd %rsi, %xmm0
+; SSE2-NEXT: notq %rsi
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %rax
+; SSE2-NEXT: movq %rsi, %rdx
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_strange_type:
+; SSE42: # BB#0:
+; SSE42-NEXT: sarq $63, %rsi
+; SSE42-NEXT: movd %rsi, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE42-NEXT: pxor %xmm0, %xmm1
+; SSE42-NEXT: movd %xmm1, %rax
+; SSE42-NEXT: pextrq $1, %xmm1, %rdx
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_strange_type:
+; AVX1: # BB#0:
+; AVX1-NEXT: sarq $63, %rsi
+; AVX1-NEXT: vmovq %rsi, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_strange_type:
+; AVX2: # BB#0:
+; AVX2-NEXT: sarq $63, %rsi
+; AVX2-NEXT: vmovq %rsi, %xmm0
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT: retq
+;
+ %sign = ashr <1 x i128> %x, <i128 127>
+ %not = xor <1 x i128> %sign, <i128 -1>
+ ret <1 x i128> %not
+}
+
+define <32 x i8> @test_pcmpgtb_256(<32 x i8> %x) {
+; SSE-LABEL: test_pcmpgtb_256:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE-NEXT: pcmpgtb %xmm2, %xmm0
+; SSE-NEXT: pcmpgtb %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_pcmpgtb_256:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_pcmpgtb_256:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+ %sign = ashr <32 x i8> %x, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ %not = xor <32 x i8> %sign, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ ret <32 x i8> %not
+}
+
+define <16 x i16> @test_pcmpgtw_256(<16 x i16> %x) {
+; SSE-LABEL: test_pcmpgtw_256:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE-NEXT: pcmpgtw %xmm2, %xmm0
+; SSE-NEXT: pcmpgtw %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_pcmpgtw_256:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_pcmpgtw_256:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+ %sign = ashr <16 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+ %not = xor <16 x i16> %sign, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ ret <16 x i16> %not
+}
+
+define <8 x i32> @test_pcmpgtd_256(<8 x i32> %x) {
+; SSE-LABEL: test_pcmpgtd_256:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE-NEXT: pcmpgtd %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_pcmpgtd_256:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_pcmpgtd_256:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+ %sign = ashr <8 x i32> %x, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+ %not = xor <8 x i32> %sign, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ ret <8 x i32> %not
+}
+
+define <4 x i64> @test_pcmpgtq_256(<4 x i64> %x) {
+; SSE2-LABEL: test_pcmpgtq_256:
+; SSE2: # BB#0:
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: test_pcmpgtq_256:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; SSE42-NEXT: pcmpgtq %xmm2, %xmm1
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: test_pcmpgtq_256:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_pcmpgtq_256:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+ %sign = ashr <4 x i64> %x, <i64 63, i64 63, i64 63, i64 63>
+ %not = xor <4 x i64> %sign, <i64 -1, i64 -1, i64 -1, i64 -1>
+ ret <4 x i64> %not
+}
+
+define <16 x i8> @cmpeq_zext_v16i8(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: cmpeq_zext_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: cmpeq_zext_v16i8:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %cmp = icmp eq <16 x i8> %a, %b
+ %zext = zext <16 x i1> %cmp to <16 x i8>
+ ret <16 x i8> %zext
+}
+
+define <16 x i16> @cmpeq_zext_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; SSE-LABEL: cmpeq_zext_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqw %xmm2, %xmm0
+; SSE-NEXT: psrlw $15, %xmm0
+; SSE-NEXT: pcmpeqw %xmm3, %xmm1
+; SSE-NEXT: psrlw $15, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: cmpeq_zext_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cmpeq_zext_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+ %cmp = icmp eq <16 x i16> %a, %b
+ %zext = zext <16 x i1> %cmp to <16 x i16>
+ ret <16 x i16> %zext
+}
+
+define <4 x i32> @cmpeq_zext_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: cmpeq_zext_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: psrld $31, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: cmpeq_zext_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $31, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %cmp = icmp eq <4 x i32> %a, %b
+ %zext = zext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %zext
+}
+
+define <4 x i64> @cmpeq_zext_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; SSE2-LABEL: cmpeq_zext_v4i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: cmpeq_zext_v4i64:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpeqq %xmm2, %xmm0
+; SSE42-NEXT: psrlq $63, %xmm0
+; SSE42-NEXT: pcmpeqq %xmm3, %xmm1
+; SSE42-NEXT: psrlq $63, %xmm1
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: cmpeq_zext_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cmpeq_zext_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $63, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+ %cmp = icmp eq <4 x i64> %a, %b
+ %zext = zext <4 x i1> %cmp to <4 x i64>
+ ret <4 x i64> %zext
+}
+
+define <32 x i8> @cmpgt_zext_v32i8(<32 x i8> %a, <32 x i8> %b) {
+; SSE-LABEL: cmpgt_zext_v32i8:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtb %xmm2, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pcmpgtb %xmm3, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: cmpgt_zext_v32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cmpgt_zext_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+ %cmp = icmp sgt <32 x i8> %a, %b
+ %zext = zext <32 x i1> %cmp to <32 x i8>
+ ret <32 x i8> %zext
+}
+
+define <8 x i16> @cmpgt_zext_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: cmpgt_zext_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE-NEXT: psrlw $15, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: cmpgt_zext_v8i16:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %cmp = icmp sgt <8 x i16> %a, %b
+ %zext = zext <8 x i1> %cmp to <8 x i16>
+ ret <8 x i16> %zext
+}
+
+define <8 x i32> @cmpgt_zext_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: cmpgt_zext_v8i32:
+; SSE: # BB#0:
+; SSE-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE-NEXT: psrld $31, %xmm0
+; SSE-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE-NEXT: psrld $31, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: cmpgt_zext_v8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cmpgt_zext_v8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $31, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+ %cmp = icmp sgt <8 x i32> %a, %b
+ %zext = zext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %zext
+}
+
+define <2 x i64> @cmpgt_zext_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: cmpgt_zext_v2i64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: cmpgt_zext_v2i64:
+; SSE42: # BB#0:
+; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; SSE42-NEXT: psrlq $63, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: cmpgt_zext_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlq $63, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+ %cmp = icmp sgt <2 x i64> %a, %b
+ %zext = zext <2 x i1> %cmp to <2 x i64>
+ ret <2 x i64> %zext
+}
diff --git a/test/CodeGen/X86/vector-popcnt-512.ll b/test/CodeGen/X86/vector-popcnt-512.ll
index 54b7af6830c0..cf4f21e62b61 100644
--- a/test/CodeGen/X86/vector-popcnt-512.ll
+++ b/test/CodeGen/X86/vector-popcnt-512.ll
@@ -1,156 +1,174 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
-; ALL-LABEL: testv8i64:
-; ALL: ## BB#0:
-; ALL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; ALL-NEXT: vpextrq $1, %xmm1, %rax
-; ALL-NEXT: popcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm2
-; ALL-NEXT: vmovq %xmm1, %rax
-; ALL-NEXT: popcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm1
-; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; ALL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; ALL-NEXT: vpextrq $1, %xmm2, %rax
-; ALL-NEXT: popcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm3
-; ALL-NEXT: vmovq %xmm2, %rax
-; ALL-NEXT: popcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm2
-; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ALL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ALL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; ALL-NEXT: vpextrq $1, %xmm2, %rax
-; ALL-NEXT: popcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm3
-; ALL-NEXT: vmovq %xmm2, %rax
-; ALL-NEXT: popcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm2
-; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ALL-NEXT: vpextrq $1, %xmm0, %rax
-; ALL-NEXT: popcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm3
-; ALL-NEXT: vmovq %xmm0, %rax
-; ALL-NEXT: popcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm0
-; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; ALL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; ALL-NEXT: retq
+; AVX512F-LABEL: testv8i64:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5
+; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: testv8i64:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %in)
ret <8 x i64> %out
}
define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
-; ALL-LABEL: testv16i32:
-; ALL: ## BB#0:
-; ALL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; ALL-NEXT: vpextrd $1, %xmm1, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vmovd %xmm1, %ecx
-; ALL-NEXT: popcntl %ecx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm2
-; ALL-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; ALL-NEXT: vpextrd $2, %xmm1, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; ALL-NEXT: vpextrd $3, %xmm1, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
-; ALL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; ALL-NEXT: vpextrd $1, %xmm2, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vmovd %xmm2, %ecx
-; ALL-NEXT: popcntl %ecx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm3
-; ALL-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $2, %xmm2, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $3, %xmm2, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
-; ALL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ALL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; ALL-NEXT: vpextrd $1, %xmm2, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vmovd %xmm2, %ecx
-; ALL-NEXT: popcntl %ecx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm3
-; ALL-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $2, %xmm2, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $3, %xmm2, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
-; ALL-NEXT: vpextrd $1, %xmm0, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vmovd %xmm0, %ecx
-; ALL-NEXT: popcntl %ecx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm3
-; ALL-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $2, %xmm0, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $3, %xmm0, %eax
-; ALL-NEXT: popcntl %eax, %eax
-; ALL-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
-; ALL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; ALL-NEXT: retq
+; AVX512F-LABEL: testv16i32:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
+; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5
+; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
+; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5
+; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpaddb %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7]
+; AVX512F-NEXT: vpsadbw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5]
+; AVX512F-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: testv16i32:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %in)
ret <16 x i32> %out
}
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
-; ALL-LABEL: testv32i16:
-; ALL: ## BB#0:
-; ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; ALL-NEXT: vpand %ymm2, %ymm0, %ymm3
-; ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; ALL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; ALL-NEXT: vpsrlw $4, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm2, %ymm0, %ymm0
-; ALL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; ALL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vpsllw $8, %ymm0, %ymm3
-; ALL-NEXT: vpaddb %ymm0, %ymm3, %ymm0
-; ALL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm2, %ymm1, %ymm3
-; ALL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; ALL-NEXT: vpsrlw $4, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; ALL-NEXT: vpshufb %ymm1, %ymm4, %ymm1
-; ALL-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; ALL-NEXT: vpsllw $8, %ymm1, %ymm2
-; ALL-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; ALL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512F-LABEL: testv32i16:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm3
+; AVX512F-NEXT: vpaddb %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm2
+; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: testv32i16:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %in)
ret <32 x i16> %out
}
define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
-; ALL-LABEL: testv64i8:
-; ALL: ## BB#0:
-; ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; ALL-NEXT: vpand %ymm2, %ymm0, %ymm3
-; ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; ALL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; ALL-NEXT: vpsrlw $4, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm2, %ymm0, %ymm0
-; ALL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; ALL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm2, %ymm1, %ymm3
-; ALL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; ALL-NEXT: vpsrlw $4, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; ALL-NEXT: vpshufb %ymm1, %ymm4, %ymm1
-; ALL-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512F-LABEL: testv64i8:
+; AVX512F: ## BB#0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: testv64i8:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %in)
ret <64 x i8> %out
}
diff --git a/test/CodeGen/X86/vector-rem.ll b/test/CodeGen/X86/vector-rem.ll
index 51cd872643f2..5fb37ec8710f 100644
--- a/test/CodeGen/X86/vector-rem.ll
+++ b/test/CodeGen/X86/vector-rem.ll
@@ -1,15 +1,117 @@
-; RUN: llc < %s -march=x86-64 | grep div | count 8
-; RUN: llc < %s -march=x86-64 | grep fmodf | count 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
define <4 x i32> @foo(<4 x i32> %t, <4 x i32> %u) {
- %m = srem <4 x i32> %t, %u
- ret <4 x i32> %m
+; CHECK-LABEL: foo:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; CHECK-NEXT: movd %xmm2, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; CHECK-NEXT: movd %xmm2, %ecx
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: movd %edx, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
+; CHECK-NEXT: movd %xmm3, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
+; CHECK-NEXT: movd %xmm3, %ecx
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: movd %edx, %xmm3
+; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movd %xmm1, %ecx
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: movd %edx, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; CHECK-NEXT: movd %xmm0, %ecx
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: movd %edx, %xmm0
+; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
+;
+ %m = srem <4 x i32> %t, %u
+ ret <4 x i32> %m
}
+
define <4 x i32> @bar(<4 x i32> %t, <4 x i32> %u) {
- %m = urem <4 x i32> %t, %u
- ret <4 x i32> %m
+; CHECK-LABEL: bar:
+; CHECK: # BB#0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; CHECK-NEXT: movd %xmm2, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; CHECK-NEXT: movd %xmm2, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: movd %edx, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
+; CHECK-NEXT: movd %xmm3, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
+; CHECK-NEXT: movd %xmm3, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: movd %edx, %xmm3
+; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movd %xmm1, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: movd %edx, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; CHECK-NEXT: movd %xmm0, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: movd %edx, %xmm0
+; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
+;
+ %m = urem <4 x i32> %t, %u
+ ret <4 x i32> %m
}
+
define <4 x float> @qux(<4 x float> %t, <4 x float> %u) {
- %m = frem <4 x float> %t, %u
- ret <4 x float> %m
+; CHECK-LABEL: qux:
+; CHECK: # BB#0:
+; CHECK-NEXT: subq $72, %rsp
+; CHECK: movaps %xmm1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; CHECK-NEXT: callq fmodf
+; CHECK-NEXT: movaps %xmm0, (%rsp)
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; CHECK-NEXT: callq fmodf
+; CHECK-NEXT: unpcklps (%rsp), %xmm0
+; CHECK: movaps %xmm0, (%rsp)
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-NEXT: callq fmodf
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
+; CHECK-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; CHECK-NEXT: callq fmodf
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: unpcklps (%rsp), %xmm1
+; CHECK: movaps %xmm1, %xmm0
+; CHECK-NEXT: addq $72, %rsp
+; CHECK-NEXT: retq
+;
+ %m = frem <4 x float> %t, %u
+ ret <4 x float> %m
}
diff --git a/test/CodeGen/X86/vector-rotate-128.ll b/test/CodeGen/X86/vector-rotate-128.ll
index 4ad4aa46c5a0..50febd4c1ec7 100644
--- a/test/CodeGen/X86/vector-rotate-128.ll
+++ b/test/CodeGen/X86/vector-rotate-128.ll
@@ -215,7 +215,7 @@ define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32]
; X32-SSE-NEXT: psubd %xmm1, %xmm2
; X32-SSE-NEXT: pslld $23, %xmm1
-; X32-SSE-NEXT: paddd .LCPI1_1, %xmm1
+; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm0, %xmm1
@@ -667,7 +667,7 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
; X32-SSE-NEXT: movdqa %xmm0, %xmm5
; X32-SSE-NEXT: psllw $4, %xmm5
-; X32-SSE-NEXT: pand .LCPI3_1, %xmm5
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm5
; X32-SSE-NEXT: pand %xmm2, %xmm5
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: por %xmm5, %xmm2
@@ -677,7 +677,7 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm2, %xmm6
; X32-SSE-NEXT: psllw $2, %xmm2
-; X32-SSE-NEXT: pand .LCPI3_2, %xmm2
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: pand %xmm5, %xmm2
; X32-SSE-NEXT: por %xmm6, %xmm2
; X32-SSE-NEXT: paddb %xmm1, %xmm1
@@ -693,7 +693,7 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm0, %xmm6
; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI3_3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm5, %xmm0
; X32-SSE-NEXT: por %xmm6, %xmm0
; X32-SSE-NEXT: paddb %xmm4, %xmm4
@@ -702,7 +702,7 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm0, %xmm6
; X32-SSE-NEXT: psrlw $2, %xmm0
-; X32-SSE-NEXT: pand .LCPI3_4, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm5, %xmm0
; X32-SSE-NEXT: por %xmm6, %xmm0
; X32-SSE-NEXT: paddb %xmm4, %xmm4
@@ -710,7 +710,7 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $1, %xmm0
-; X32-SSE-NEXT: pand .LCPI3_5, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
@@ -955,44 +955,34 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
;
; SSE41-LABEL: constant_rotate_v8i16:
; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
-; SSE41-NEXT: pmullw %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [256,61680,57568,53456,49344,45232,41120,37008]
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrlw $4, %xmm3
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [512,57824,49600,41376,33152,24928,16704,8480]
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrlw $2, %xmm3
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [1024,50112,33664,17216,768,49856,33408,16960]
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrlw $1, %xmm3
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [2048,34688,1792,34432,1536,34176,1280,33920]
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; SSE41-NEXT: pmullw %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrlw $8, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: psrlw $4, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4],xmm2[5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrlw $2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2],xmm0[3,4],xmm2[5,6],xmm0[7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: psrlw $1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
+; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_rotate_v8i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,61680,57568,53456,49344,45232,41120,37008]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [512,57824,49600,41376,33152,24928,16704,8480]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4],xmm0[5,6,7]
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1024,50112,33664,17216,768,49856,33408,16960]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3,4],xmm2[5,6],xmm0[7]
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2048,34688,1792,34432,1536,34176,1280,33920]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
@@ -1000,8 +990,7 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
; AVX2: # BB#0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
@@ -1202,7 +1191,7 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: pcmpgtb %xmm3, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psllw $4, %xmm4
-; X32-SSE-NEXT: pand .LCPI7_1, %xmm4
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm4
; X32-SSE-NEXT: pand %xmm1, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm1
; X32-SSE-NEXT: por %xmm4, %xmm1
@@ -1212,7 +1201,7 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm4, %xmm5
; X32-SSE-NEXT: pandn %xmm1, %xmm5
; X32-SSE-NEXT: psllw $2, %xmm1
-; X32-SSE-NEXT: pand .LCPI7_2, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: pand %xmm4, %xmm1
; X32-SSE-NEXT: por %xmm5, %xmm1
; X32-SSE-NEXT: paddb %xmm3, %xmm3
@@ -1229,7 +1218,7 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm0, %xmm6
; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI7_4, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm5, %xmm0
; X32-SSE-NEXT: por %xmm6, %xmm0
; X32-SSE-NEXT: paddb %xmm4, %xmm4
@@ -1238,7 +1227,7 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm5, %xmm6
; X32-SSE-NEXT: pandn %xmm0, %xmm6
; X32-SSE-NEXT: psrlw $2, %xmm0
-; X32-SSE-NEXT: pand .LCPI7_5, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm5, %xmm0
; X32-SSE-NEXT: por %xmm6, %xmm0
; X32-SSE-NEXT: paddb %xmm4, %xmm4
@@ -1246,7 +1235,7 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm2, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $1, %xmm0
-; X32-SSE-NEXT: pand .LCPI7_6, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: por %xmm3, %xmm0
@@ -1393,9 +1382,9 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllw $4, %xmm1
-; X32-SSE-NEXT: pand .LCPI11_0, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI11_1, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
@@ -1440,8 +1429,8 @@ define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllq $15, %xmm1
; X32-SSE-NEXT: psrlq $49, %xmm0
-; X32-SSE-NEXT: pand .LCPI12_0, %xmm0
-; X32-SSE-NEXT: pand .LCPI12_1, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: por %xmm0, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
@@ -1485,8 +1474,8 @@ define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: pslld $4, %xmm1
; X32-SSE-NEXT: psrld $28, %xmm0
-; X32-SSE-NEXT: pand .LCPI13_0, %xmm0
-; X32-SSE-NEXT: pand .LCPI13_1, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: por %xmm0, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
@@ -1530,8 +1519,8 @@ define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllw $5, %xmm1
; X32-SSE-NEXT: psrlw $11, %xmm0
-; X32-SSE-NEXT: pand .LCPI14_0, %xmm0
-; X32-SSE-NEXT: pand .LCPI14_1, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: por %xmm0, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
@@ -1578,11 +1567,11 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
; X32-SSE: # BB#0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllw $4, %xmm1
-; X32-SSE-NEXT: pand .LCPI15_0, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI15_1, %xmm0
-; X32-SSE-NEXT: pand .LCPI15_2, %xmm0
-; X32-SSE-NEXT: pand .LCPI15_3, %xmm1
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: por %xmm0, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
diff --git a/test/CodeGen/X86/vector-rotate-256.ll b/test/CodeGen/X86/vector-rotate-256.ll
index 379b5fcb635f..af1755e14314 100644
--- a/test/CodeGen/X86/vector-rotate-256.ll
+++ b/test/CodeGen/X86/vector-rotate-256.ll
@@ -223,11 +223,11 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX2-NEXT: vpsubw %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15]
; AVX2-NEXT: vpsllvd %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT: vpackusdw %ymm4, %ymm1, %ymm1
@@ -498,59 +498,51 @@ define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
ret <8 x i32> %or
}
-define <16 x i16> @constant_rotate_v8i16(<16 x i16> %a) nounwind {
-; AVX1-LABEL: constant_rotate_v8i16:
+define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind {
+; AVX1-LABEL: constant_rotate_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,28784,24672,20560,16448,12336,8224,4112]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,57568,49344,41120,32896,24672,16448,8224]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4],xmm2[5,6,7]
; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [512,49600,33152,16704,256,49344,32896,16448]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4],xmm3[5,6],xmm2[7]
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1024,33664,768,33408,512,33152,256,32896]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,61680,57568,53456,49344,45232,41120,37008]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3,4,5,6,7]
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [512,57824,49600,41376,33152,24928,16704,8480]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3,4],xmm0[5,6,7]
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1024,50112,33664,17216,768,49856,33408,16960]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3,4],xmm3[5,6],xmm0[7]
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2048,34688,1792,34432,1536,34176,1280,33920]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: constant_rotate_v8i16:
+; AVX2-LABEL: constant_rotate_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT: vpsrlvd %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; AVX2-NEXT: vpsrlvd %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
-; XOPAVX1-LABEL: constant_rotate_v8i16:
+; XOPAVX1-LABEL: constant_rotate_v16i16:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -565,7 +557,7 @@ define <16 x i16> @constant_rotate_v8i16(<16 x i16> %a) nounwind {
; XOPAVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
-; XOPAVX2-LABEL: constant_rotate_v8i16:
+; XOPAVX2-LABEL: constant_rotate_v16i16:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
index b63c3f084b22..018c5922a432 100644
--- a/test/CodeGen/X86/vector-sext.ll
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -4,6 +4,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
;
; Just one 32-bit run to make sure we do reasonable things there.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE41
@@ -81,6 +82,11 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ss
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: sext_16i8_to_16i16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: sext_16i8_to_16i16:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm2
@@ -143,14 +149,12 @@ define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
;
; SSSE3-LABEL: sext_16i8_to_8i32:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT: psrad $24, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: psrad $24, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,4,u,u,u,5,u,u,u,6,u,u,u,7]
; SSSE3-NEXT: psrad $24, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i8_to_8i32:
@@ -171,11 +175,14 @@ define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
;
; AVX2-LABEL: sext_16i8_to_8i32:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpslld $24, %ymm0, %ymm0
-; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
+; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: sext_16i8_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: sext_16i8_to_8i32:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm2
@@ -285,12 +292,14 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp
;
; AVX2-LABEL: sext_16i8_to_4i64:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: sext_16i8_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxbq %xmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: sext_16i8_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm2
@@ -304,6 +313,137 @@ entry:
ret <4 x i64> %C
}
+define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_16i8_to_8i64:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: psrad $24, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: psrad $24, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: psrad $24, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT: psrld $16, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: psrad $24, %xmm3
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: sext_16i8_to_8i64:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: psrad $24, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
+; SSSE3-NEXT: psrld $16, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: psrad $24, %xmm1
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: psrad $24, %xmm2
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSSE3-NEXT: psrld $16, %xmm3
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: psrad $24, %xmm3
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: sext_16i8_to_8i64:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovsxbq %xmm0, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: pmovsxbq %xmm1, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovsxbq %xmm2, %xmm2
+; SSE41-NEXT: psrlq $48, %xmm0
+; SSE41-NEXT: pmovsxbq %xmm0, %xmm3
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: sext_16i8_to_8i64:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
+; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovsxbq %xmm1, %xmm1
+; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT: vmovaps %ymm2, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sext_16i8_to_8i64:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpslld $24, %xmm1, %xmm1
+; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1
+; AVX2-NEXT: vmovdqa %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: sext_16i8_to_8i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpsllq $56, %zmm0, %zmm0
+; AVX512-NEXT: vpsraq $56, %zmm0, %zmm0
+; AVX512-NEXT: retq
+;
+; X32-SSE41-LABEL: sext_16i8_to_8i64:
+; X32-SSE41: # BB#0: # %entry
+; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm4
+; X32-SSE41-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE41-NEXT: psrld $16, %xmm1
+; X32-SSE41-NEXT: pmovsxbq %xmm1, %xmm1
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; X32-SSE41-NEXT: pmovsxbq %xmm2, %xmm2
+; X32-SSE41-NEXT: psrlq $48, %xmm0
+; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm3
+; X32-SSE41-NEXT: movdqa %xmm4, %xmm0
+; X32-SSE41-NEXT: retl
+entry:
+ %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %C = sext <8 x i8> %B to <8 x i64>
+ ret <8 x i64> %C
+}
+
define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_8i16_to_4i32:
; SSE2: # BB#0: # %entry
@@ -377,6 +517,11 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: sext_8i16_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: sext_8i16_to_8i32:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm2
@@ -479,12 +624,14 @@ define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp
;
; AVX2-LABEL: sext_8i16_to_4i64:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: sext_8i16_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxwq %xmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: sext_8i16_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm2
@@ -577,6 +724,11 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: sext_4i32_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: sext_4i32_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
@@ -603,18 +755,40 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
-; AVX-LABEL: load_sext_2i1_to_2i64:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: movzbl (%rdi), %eax
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $62, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm0
-; AVX-NEXT: shlq $63, %rax
-; AVX-NEXT: sarq $63, %rax
-; AVX-NEXT: vmovq %rax, %xmm1
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: retq
+; AVX1-LABEL: load_sext_2i1_to_2i64:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: movzbl (%rdi), %eax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $62, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: shlq $63, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_sext_2i1_to_2i64:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: movzbl (%rdi), %eax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $62, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: shlq $63, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vmovq %rax, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_sext_2i1_to_2i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: movzbl (%rdi), %eax
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_2i1_to_2i64:
; X32-SSE41: # BB#0: # %entry
@@ -749,25 +923,55 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
; SSE41-NEXT: pinsrd $3, %eax, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: load_sext_4i1_to_4i32:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: movzbl (%rdi), %eax
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $62, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: movq %rax, %rdx
-; AVX-NEXT: shlq $63, %rdx
-; AVX-NEXT: sarq $63, %rdx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $61, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX-NEXT: shlq $60, %rax
-; AVX-NEXT: sarq $63, %rax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: load_sext_4i1_to_4i32:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: movzbl (%rdi), %eax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $62, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: movq %rax, %rdx
+; AVX1-NEXT: shlq $63, %rdx
+; AVX1-NEXT: sarq $63, %rdx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $61, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shlq $60, %rax
+; AVX1-NEXT: sarq $63, %rax
+; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_sext_4i1_to_4i32:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: movzbl (%rdi), %eax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $62, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: movq %rax, %rdx
+; AVX2-NEXT: shlq $63, %rdx
+; AVX2-NEXT: sarq $63, %rdx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $61, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shlq $60, %rax
+; AVX2-NEXT: sarq $63, %rax
+; AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_sext_4i1_to_4i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: movzbl (%rdi), %eax
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_4i1_to_4i32:
; X32-SSE41: # BB#0: # %entry
@@ -836,24 +1040,20 @@ entry:
define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
; SSE2-LABEL: load_sext_4i1_to_4i64:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movzbl (%rdi), %eax
+; SSE2-NEXT: movl (%rdi), %eax
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $3, %ecx
-; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl %ecx
-; SSE2-NEXT: andl $1, %ecx
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: andl $1, %ecx
-; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: shrl $2, %eax
-; SSE2-NEXT: andl $1, %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
; SSE2-NEXT: psllq $63, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
@@ -866,24 +1066,20 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
;
; SSSE3-LABEL: load_sext_4i1_to_4i64:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movzbl (%rdi), %eax
+; SSSE3-NEXT: movl (%rdi), %eax
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $3, %ecx
-; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm0
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl %ecx
-; SSSE3-NEXT: andl $1, %ecx
; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: andl $1, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
+; SSSE3-NEXT: movd %eax, %xmm2
; SSSE3-NEXT: shrl $2, %eax
-; SSSE3-NEXT: andl $1, %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
; SSSE3-NEXT: psllq $63, %xmm0
; SSSE3-NEXT: psrad $31, %xmm0
@@ -896,21 +1092,17 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
;
; SSE41-LABEL: load_sext_4i1_to_4i64:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movzbl (%rdi), %eax
+; SSE41-NEXT: movl (%rdi), %eax
; SSE41-NEXT: movl %eax, %ecx
; SSE41-NEXT: shrl %ecx
-; SSE41-NEXT: andl $1, %ecx
-; SSE41-NEXT: movl %eax, %edx
-; SSE41-NEXT: andl $1, %edx
-; SSE41-NEXT: movd %edx, %xmm1
+; SSE41-NEXT: movd %eax, %xmm1
; SSE41-NEXT: pinsrd $1, %ecx, %xmm1
; SSE41-NEXT: movl %eax, %ecx
; SSE41-NEXT: shrl $2, %ecx
-; SSE41-NEXT: andl $1, %ecx
; SSE41-NEXT: pinsrd $2, %ecx, %xmm1
; SSE41-NEXT: shrl $3, %eax
-; SSE41-NEXT: andl $1, %eax
; SSE41-NEXT: pinsrd $3, %eax, %xmm1
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: psllq $63, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
@@ -968,24 +1160,29 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_4i1_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: movzbl (%rdi), %eax
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_4i1_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movzbl (%eax), %eax
; X32-SSE41-NEXT: movl %eax, %ecx
; X32-SSE41-NEXT: shrl %ecx
-; X32-SSE41-NEXT: andl $1, %ecx
-; X32-SSE41-NEXT: movl %eax, %edx
-; X32-SSE41-NEXT: andl $1, %edx
-; X32-SSE41-NEXT: movd %edx, %xmm1
+; X32-SSE41-NEXT: movd %eax, %xmm1
; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm1
; X32-SSE41-NEXT: movl %eax, %ecx
; X32-SSE41-NEXT: shrl $2, %ecx
-; X32-SSE41-NEXT: andl $1, %ecx
; X32-SSE41-NEXT: pinsrd $2, %ecx, %xmm1
; X32-SSE41-NEXT: shrl $3, %eax
-; X32-SSE41-NEXT: andl $1, %eax
; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1
+; X32-SSE41-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; X32-SSE41-NEXT: psllq $63, %xmm0
; X32-SSE41-NEXT: psrad $31, %xmm0
@@ -1050,6 +1247,11 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_4i8_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxbq (%rdi), %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -1182,40 +1384,84 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
; SSE41-NEXT: pinsrw $7, %eax, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: load_sext_8i1_to_8i16:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: movsbq (%rdi), %rax
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $62, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: movq %rax, %rdx
-; AVX-NEXT: shlq $63, %rdx
-; AVX-NEXT: sarq $63, %rdx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $61, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $60, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $59, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $58, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $57, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; AVX-NEXT: shrq $7, %rax
-; AVX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: load_sext_8i1_to_8i16:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: movsbq (%rdi), %rax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $62, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: movq %rax, %rdx
+; AVX1-NEXT: shlq $63, %rdx
+; AVX1-NEXT: sarq $63, %rdx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $61, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $60, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $59, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $58, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $57, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrq $7, %rax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_sext_8i1_to_8i16:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: movsbq (%rdi), %rax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $62, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: movq %rax, %rdx
+; AVX2-NEXT: shlq $63, %rdx
+; AVX2-NEXT: sarq $63, %rdx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $61, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $60, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $59, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $58, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $57, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrq $7, %rax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_sext_8i1_to_8i16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: movzbl (%rdi), %eax
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_8i1_to_8i16:
; X32-SSE41: # BB#0: # %entry
@@ -1294,6 +1540,102 @@ entry:
ret <8 x i16> %Y
}
+define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_8i8_to_8i64:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movsbq 1(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: movsbq (%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movsbq 3(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: movsbq 2(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: movsbq 5(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm3
+; SSE2-NEXT: movsbq 4(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT: movsbq 7(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm4
+; SSE2-NEXT: movsbq 6(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm3
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: load_sext_8i8_to_8i64:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movsbq 1(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: movsbq (%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movsbq 3(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm2
+; SSSE3-NEXT: movsbq 2(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT: movsbq 5(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm3
+; SSSE3-NEXT: movsbq 4(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm2
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSSE3-NEXT: movsbq 7(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm4
+; SSSE3-NEXT: movsbq 6(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm3
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: load_sext_8i8_to_8i64:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
+; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm1
+; SSE41-NEXT: pmovsxbq 4(%rdi), %xmm2
+; SSE41-NEXT: pmovsxbq 6(%rdi), %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: load_sext_8i8_to_8i64:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpmovsxbd 4(%rdi), %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_sext_8i8_to_8i64:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0
+; AVX2-NEXT: vpmovsxbq 4(%rdi), %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_sext_8i8_to_8i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxbq (%rdi), %zmm0
+; AVX512-NEXT: retq
+;
+; X32-SSE41-LABEL: load_sext_8i8_to_8i64:
+; X32-SSE41: # BB#0: # %entry
+; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
+; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm1
+; X32-SSE41-NEXT: pmovsxbq 4(%eax), %xmm2
+; X32-SSE41-NEXT: pmovsxbq 6(%eax), %xmm3
+; X32-SSE41-NEXT: retl
+entry:
+ %X = load <8 x i8>, <8 x i8>* %ptr
+ %Y = sext <8 x i8> %X to <8 x i64>
+ ret <8 x i64> %Y
+}
+
define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
; SSE2-LABEL: load_sext_8i1_to_8i32:
; SSE2: # BB#0: # %entry
@@ -1506,6 +1848,15 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_8i1_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: movzbl (%rdi), %eax
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_8i1_to_8i32:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -1597,6 +1948,11 @@ define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) {
; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_8i8_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxbd (%rdi), %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_8i8_to_8i32:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -1864,71 +2220,145 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone {
; SSE41-NEXT: pinsrb $15, %eax, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: load_sext_16i1_to_16i8:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: movswq (%rdi), %rax
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $62, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: movq %rax, %rdx
-; AVX-NEXT: shlq $63, %rdx
-; AVX-NEXT: sarq $63, %rdx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $61, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $60, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $59, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $58, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $57, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movsbq %al, %rcx
-; AVX-NEXT: shrq $7, %rcx
-; AVX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $55, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $54, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $53, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $52, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $51, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $50, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $49, %rcx
-; AVX-NEXT: sarq $63, %rcx
-; AVX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX-NEXT: shrq $15, %rax
-; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: load_sext_16i1_to_16i8:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: movswq (%rdi), %rax
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $62, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: movq %rax, %rdx
+; AVX1-NEXT: shlq $63, %rdx
+; AVX1-NEXT: sarq $63, %rdx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $61, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $60, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $59, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $58, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $57, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movsbq %al, %rcx
+; AVX1-NEXT: shrq $7, %rcx
+; AVX1-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $55, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $54, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $53, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $52, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $51, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $50, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shlq $49, %rcx
+; AVX1-NEXT: sarq $63, %rcx
+; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrq $15, %rax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_sext_16i1_to_16i8:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: movswq (%rdi), %rax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $62, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: movq %rax, %rdx
+; AVX2-NEXT: shlq $63, %rdx
+; AVX2-NEXT: sarq $63, %rdx
+; AVX2-NEXT: vmovd %edx, %xmm0
+; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $61, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $60, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $59, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $58, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $57, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movsbq %al, %rcx
+; AVX2-NEXT: shrq $7, %rcx
+; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $55, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $54, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $53, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $52, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $51, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $50, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shlq $49, %rcx
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrq $15, %rax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_sext_16i1_to_16i8:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: kmovw (%rdi), %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_16i1_to_16i8:
; X32-SSE41: # BB#0: # %entry
@@ -2460,6 +2890,14 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_16i1_to_16i16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: kmovw (%rdi), %k1
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_16i1_to_16i16:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -3297,6 +3735,18 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone {
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_32i1_to_32i8:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: kmovw (%rdi), %k1
+; AVX512-NEXT: kmovw 2(%rdi), %k2
+; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z}
+; AVX512-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k2} {z}
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_32i1_to_32i8:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: pushl %esi
@@ -3472,6 +3922,11 @@ define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) {
; AVX2-NEXT: vpmovsxbw (%rdi), %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_16i8_to_16i16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxbw (%rdi), %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_16i8_to_16i16:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -3611,6 +4066,11 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
; AVX2-NEXT: vpmovsxwq (%rdi), %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_4i16_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxwq (%rdi), %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -3662,6 +4122,11 @@ define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) {
; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_8i16_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxwd (%rdi), %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_8i16_to_8i32:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -3755,6 +4220,11 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
; AVX2-NEXT: vpmovsxdq (%rdi), %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: load_sext_4i32_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovsxdq (%rdi), %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: load_sext_4i32_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -3864,6 +4334,13 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: sext_4i1_to_4i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: sext_4i1_to_4i64:
; X32-SSE41: # BB#0:
; X32-SSE41-NEXT: pslld $31, %xmm0
@@ -3931,6 +4408,13 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: retq
;
+; AVX512-LABEL: sext_4i8_to_4i64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX512-NEXT: retq
+;
; X32-SSE41-LABEL: sext_4i8_to_4i64:
; X32-SSE41: # BB#0:
; X32-SSE41-NEXT: pslld $24, %xmm0
diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll
index 771445df85e0..81eaeb998075 100644
--- a/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -311,6 +311,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -323,7 +324,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX512-LABEL: var_shift_v8i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v8i16:
@@ -677,7 +681,7 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX512-LABEL: splatvar_shift_v4i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
@@ -745,9 +749,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: psllw $5, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
@@ -949,9 +952,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; X32-SSE-NEXT: psllw $5, %xmm3
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
@@ -1194,48 +1196,33 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE41-LABEL: constant_shift_v8i16:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $8, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: psraw $4, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: psraw $1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4112,8224,12336,16448,20560,24672,28784]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8224,16448,24672,32896,41120,49344,57568]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16448,32896,49344,256,16704,33152,49600]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,32896,256,33152,512,33408,768,33664]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -1248,8 +1235,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
;
; AVX512-LABEL: constant_shift_v8i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
@@ -1656,7 +1645,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlw $3, %xmm0
-; X32-SSE-NEXT: pand .LCPI15_0, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X32-SSE-NEXT: pxor %xmm1, %xmm0
; X32-SSE-NEXT: psubb %xmm1, %xmm0
diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll
index 0b9c318da047..af076fbbd818 100644
--- a/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -178,11 +178,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
@@ -214,7 +214,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
;
; AVX512-LABEL: var_shift_v16i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
%shift = ashr <16 x i16> %a, %b
ret <16 x i16> %shift
@@ -458,7 +461,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512-LABEL: splatvar_shift_v8i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -763,30 +766,19 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_shift_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsraw $8, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32896,37008,41120,45232,49344,53456,57568,61680]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT: vpsraw $4, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,8480,16704,24928,33152,41376,49600,57824]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsraw $2, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [512,16960,33408,49856,768,17216,33664,50112]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpsraw $1, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1024,33920,1280,34176,1536,34432,1792,34688]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4112,8224,12336,16448,20560,24672,28784]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,8224,16448,24672,32896,41120,49344,57568]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsraw $2, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,16448,32896,49344,256,16704,33152,49600]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,32896,256,33152,512,33408,768,33664]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -795,12 +787,12 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
; AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX2-NEXT: vpsravd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -829,8 +821,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
;
; AVX512-LABEL: constant_shift_v16i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
%shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
ret <16 x i16> %shift
diff --git a/test/CodeGen/X86/vector-shift-ashr-512.ll b/test/CodeGen/X86/vector-shift-ashr-512.ll
index 147e58f4710e..8183292c77fc 100644
--- a/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -28,20 +28,20 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ: ## BB#0:
; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15]
; AVX512DQ-NEXT: vpsravd %ymm5, %ymm6, %ymm5
; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11]
; AVX512DQ-NEXT: vpsravd %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
; AVX512DQ-NEXT: vpsravd %ymm2, %ymm5, %ymm2
; AVX512DQ-NEXT: vpsrld $16, %ymm2, %ymm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
; AVX512DQ-NEXT: vpsravd %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX512DQ-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
@@ -129,7 +129,7 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind
; ALL-LABEL: splatvar_shift_v16i32:
; ALL: ## BB#0:
; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; ALL-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; ALL-NEXT: vpsrad %xmm1, %zmm0, %zmm0
; ALL-NEXT: retq
%splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -238,21 +238,21 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX512DQ-NEXT: vpsravd %ymm4, %ymm5, %ymm5
; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-NEXT: vpsravd %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; AVX512DQ-NEXT: vpsravd %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; AVX512DQ-NEXT: vpsravd %ymm4, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpsrld $16, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-NEXT: vpsravd %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
+; AVX512DQ-NEXT: vpsravd %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT: vpsrld $16, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX512DQ-NEXT: vpsravd %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpackusdw %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpackusdw %ymm4, %ymm1, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
@@ -376,3 +376,21 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
%shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <64 x i8> %shift
}
+
+define <64 x i8> @ashr_const7_v64i8(<64 x i8> %a) {
+; AVX512DQ-LABEL: ashr_const7_v64i8:
+; AVX512DQ: ## BB#0:
+; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: ashr_const7_v64i8:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpcmpgtb %zmm0, %zmm1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT: retq
+ %res = ashr <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+ ret <64 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll
index 86e54612ae74..213e2a41a662 100644
--- a/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -280,6 +280,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -292,7 +293,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX512-LABEL: var_shift_v8i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v8i16:
@@ -437,7 +441,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI3_0, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm1, %xmm1
@@ -446,7 +450,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $2, %xmm0
-; X32-SSE-NEXT: pand .LCPI3_1, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm1, %xmm1
@@ -454,7 +458,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm2, %xmm1
; X32-SSE-NEXT: pandn %xmm0, %xmm1
; X32-SSE-NEXT: psrlw $1, %xmm0
-; X32-SSE-NEXT: pand .LCPI3_2, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm2, %xmm0
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
@@ -529,7 +533,7 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX512-LABEL: splatvar_shift_v4i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
@@ -597,9 +601,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; SSE2-NEXT: psllw $5, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -727,9 +730,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; X32-SSE-NEXT: psllw $5, %xmm2
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
@@ -737,7 +739,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI7_0, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
@@ -746,7 +748,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $2, %xmm0
-; X32-SSE-NEXT: pand .LCPI7_1, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
@@ -754,7 +756,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: psrlw $1, %xmm0
-; X32-SSE-NEXT: pand .LCPI7_2, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
@@ -928,48 +930,33 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE41-LABEL: constant_shift_v8i16:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $4, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: psrlw $4, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $2, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $1, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: psrlw $1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4112,8224,12336,16448,20560,24672,28784]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8224,16448,24672,32896,41120,49344,57568]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16448,32896,49344,256,16704,33152,49600]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,32896,256,33152,512,33408,768,33664]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -982,8 +969,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
;
; AVX512-LABEL: constant_shift_v8i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
@@ -1112,7 +1101,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI11_1, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
@@ -1121,7 +1110,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psrlw $2, %xmm0
-; X32-SSE-NEXT: pand .LCPI11_2, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
@@ -1129,7 +1118,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: pandn %xmm0, %xmm2
; X32-SSE-NEXT: psrlw $1, %xmm0
-; X32-SSE-NEXT: pand .LCPI11_3, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm1, %xmm0
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
@@ -1257,7 +1246,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psrlw $3, %xmm0
-; X32-SSE-NEXT: pand .LCPI15_0, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <16 x i8> %shift
diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll
index ecc68cf2e278..f9ff3092388b 100644
--- a/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -155,11 +155,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
@@ -191,7 +191,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
;
; AVX512-LABEL: var_shift_v16i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
%shift = lshr <16 x i16> %a, %b
ret <16 x i16> %shift
@@ -369,7 +372,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512-LABEL: splatvar_shift_v8i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512-NEXT: vpsrld %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -618,30 +621,19 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_shift_v16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32896,37008,41120,45232,49344,53456,57568,61680]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,8480,16704,24928,33152,41376,49600,57824]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [512,16960,33408,49856,768,17216,33664,50112]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1024,33920,1280,34176,1536,34432,1792,34688]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4112,8224,12336,16448,20560,24672,28784]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,8224,16448,24672,32896,41120,49344,57568]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,16448,32896,49344,256,16704,33152,49600]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,32896,256,33152,512,33408,768,33664]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -650,12 +642,12 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
; AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -684,8 +676,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
;
; AVX512-LABEL: constant_shift_v16i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
%shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
ret <16 x i16> %shift
diff --git a/test/CodeGen/X86/vector-shift-lshr-512.ll b/test/CodeGen/X86/vector-shift-lshr-512.ll
index 68644e61b0e5..a7759aa9472a 100644
--- a/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -29,20 +29,20 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ: ## BB#0:
; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15]
; AVX512DQ-NEXT: vpsrlvd %ymm5, %ymm6, %ymm5
; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11]
; AVX512DQ-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
; AVX512DQ-NEXT: vpsrlvd %ymm2, %ymm5, %ymm2
; AVX512DQ-NEXT: vpsrld $16, %ymm2, %ymm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
; AVX512DQ-NEXT: vpsrlvd %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX512DQ-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
@@ -110,7 +110,7 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind
; ALL-LABEL: splatvar_shift_v16i32:
; ALL: ## BB#0:
; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; ALL-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; ALL-NEXT: vpsrld %xmm1, %zmm0, %zmm0
; ALL-NEXT: retq
%splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -202,21 +202,21 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX512DQ-NEXT: vpsrlvd %ymm4, %ymm5, %ymm5
; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; AVX512DQ-NEXT: vpsrlvd %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
-; AVX512DQ-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpsrld $16, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-NEXT: vpsrlvd %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
+; AVX512DQ-NEXT: vpsrlvd %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT: vpsrld $16, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX512DQ-NEXT: vpsrlvd %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpackusdw %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpackusdw %ymm4, %ymm1, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll
index 9b59c6224ef2..7202f1ec0cb8 100644
--- a/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -131,7 +131,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; X32-SSE-LABEL: var_shift_v4i32:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pslld $23, %xmm1
-; X32-SSE-NEXT: paddd .LCPI1_0, %xmm1
+; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm0, %xmm1
@@ -237,6 +237,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -247,7 +248,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX512-LABEL: var_shift_v8i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v8i16:
@@ -386,7 +390,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI3_0, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm1, %xmm1
@@ -395,7 +399,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $2, %xmm0
-; X32-SSE-NEXT: pand .LCPI3_1, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm1, %xmm1
@@ -477,7 +481,7 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX512-LABEL: splatvar_shift_v4i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
@@ -545,9 +549,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; SSE2-NEXT: psllw $5, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -667,9 +670,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; X32-SSE-NEXT: psllw $5, %xmm2
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
@@ -677,7 +679,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI7_0, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
@@ -686,7 +688,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $2, %xmm0
-; X32-SSE-NEXT: pand .LCPI7_1, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
@@ -839,13 +841,15 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
;
; AVX512-LABEL: constant_shift_v8i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
; X32-SSE: # BB#0:
-; X32-SSE-NEXT: pmullw .LCPI10_0, %xmm0
+; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
ret <8 x i16> %shift
@@ -951,7 +955,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $4, %xmm0
-; X32-SSE-NEXT: pand .LCPI11_1, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
@@ -960,7 +964,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-NEXT: movdqa %xmm3, %xmm4
; X32-SSE-NEXT: pandn %xmm0, %xmm4
; X32-SSE-NEXT: psllw $2, %xmm0
-; X32-SSE-NEXT: pand .LCPI11_2, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand %xmm3, %xmm0
; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: paddb %xmm2, %xmm2
@@ -1093,7 +1097,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE: # BB#0:
; X32-SSE-NEXT: psllw $3, %xmm0
-; X32-SSE-NEXT: pand .LCPI15_0, %xmm0
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <16 x i8> %shift
diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll
index 3daf24f1a82e..bc7d20cd86d8 100644
--- a/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -136,11 +136,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
@@ -166,7 +166,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
;
; AVX512-LABEL: var_shift_v16i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
%shift = shl <16 x i16> %a, %b
ret <16 x i16> %shift
@@ -333,7 +336,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX512-LABEL: splatvar_shift_v8i32:
; AVX512: ## BB#0:
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512-NEXT: vpslld %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -585,8 +588,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
;
; AVX512-LABEL: constant_shift_v16i16:
; AVX512: ## BB#0:
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT: retq
%shift = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
ret <16 x i16> %shift
diff --git a/test/CodeGen/X86/vector-shift-shl-512.ll b/test/CodeGen/X86/vector-shift-shl-512.ll
index 26ddb1c127e1..ac867c70f15f 100644
--- a/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -29,20 +29,20 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ: ## BB#0:
; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15]
; AVX512DQ-NEXT: vpsllvd %ymm5, %ymm6, %ymm5
; AVX512DQ-NEXT: vpsrld $16, %ymm5, %ymm5
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11]
; AVX512DQ-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512DQ-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
; AVX512DQ-NEXT: vpsllvd %ymm2, %ymm5, %ymm2
; AVX512DQ-NEXT: vpsrld $16, %ymm2, %ymm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
; AVX512DQ-NEXT: vpsllvd %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX512DQ-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
@@ -106,7 +106,7 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind
; ALL-LABEL: splatvar_shift_v16i32:
; ALL: ## BB#0:
; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vmovss %xmm1, %xmm2, %xmm1
+; ALL-NEXT: vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; ALL-NEXT: vpslld %xmm1, %zmm0, %zmm0
; ALL-NEXT: retq
%splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 13a9543ddd90..2651063379ff 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -9,20 +9,11 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
-; FIXME: SSE2 should look like the following:
-; FIXME-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
-; FIXME: # BB#0:
-; FIXME-NEXT: punpcklbw %xmm0, %xmm0
-; FIXME-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; FIXME-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
-; FIXME-NEXT: retq
-;
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
@@ -55,9 +46,8 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
@@ -82,10 +72,10 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; SSE2: # BB#0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,2,4,5,6,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
@@ -182,33 +172,16 @@ define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(
}
define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
-; FIXME: SSE2 should be the following:
-; FIXME-LABEL: @shuffle_v16i8_0101010101010101
-; FIXME: # BB#0:
-; FIXME-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; FIXME-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
-; FIXME-NEXT: retq
-;
-; SSE2-LABEL: shuffle_v16i8_0101010101010101:
-; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v16i8_0101010101010101:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v16i8_0101010101010101:
-; SSE41: # BB#0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; SSE41-NEXT: retq
+; SSE-LABEL: shuffle_v16i8_0101010101010101:
+; SSE: # BB#0:
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v16i8_0101010101010101:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i8_0101010101010101:
@@ -252,9 +225,8 @@ define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pandn %xmm0, %xmm2
; SSE2-NEXT: por %xmm1, %xmm2
@@ -953,7 +925,7 @@ define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(
; SSE2: # BB#0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
@@ -1132,9 +1104,8 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(
; SSE2-NEXT: packuswb %xmm5, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,1,1,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -1377,9 +1348,8 @@ define <16 x i8> @insert_dup_mem_v16i8_i32(i32* %ptr) {
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v16i8_i32:
@@ -1420,9 +1390,8 @@ define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8:
@@ -1466,9 +1435,8 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(i32* %ptr) {
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_i32:
@@ -1505,9 +1473,8 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(i32* %ptr) {
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_i32:
@@ -1545,9 +1512,8 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
@@ -1592,9 +1558,8 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 1d32f9e38523..0d50205aa4a5 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -159,7 +159,7 @@ define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) {
;
; AVX-LABEL: shuffle_v2f64_11:
; AVX: # BB#0:
-; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
ret <2 x double> %shuffle
@@ -217,7 +217,7 @@ define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
;
; AVX-LABEL: shuffle_v2f64_33:
; AVX: # BB#0:
-; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1,1]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 3>
ret <2 x double> %shuffle
@@ -762,7 +762,7 @@ define <2 x i64> @shuffle_v2i64_z1(<2 x i64> %a) {
;
; AVX512VL-LABEL: shuffle_v2i64_z1:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpxord %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 2, i32 1>
@@ -804,7 +804,7 @@ define <2 x double> @shuffle_v2f64_1z(<2 x double> %a) {
;
; AVX512VL-LABEL: shuffle_v2f64_1z:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpxord %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 1, i32 3>
@@ -833,7 +833,7 @@ define <2 x double> @shuffle_v2f64_z0(<2 x double> %a) {
;
; AVX512VL-LABEL: shuffle_v2f64_z0:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpxord %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 0>
@@ -865,11 +865,23 @@ define <2 x double> @shuffle_v2f64_z1(<2 x double> %a) {
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
-; AVX-LABEL: shuffle_v2f64_z1:
-; AVX: # BB#0:
-; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v2f64_z1:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v2f64_z1:
+; AVX2: # BB#0:
+; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v2f64_z1:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpxord %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
ret <2 x double> %shuffle
}
@@ -895,7 +907,7 @@ define <2 x double> @shuffle_v2f64_bitcast_1z(<2 x double> %a) {
;
; AVX512VL-LABEL: shuffle_v2f64_bitcast_1z:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpxord %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
; AVX512VL-NEXT: retq
%shuffle64 = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
@@ -929,33 +941,25 @@ define <2 x i64> @shuffle_v2i64_bitcast_z123(<2 x i64> %x) {
;
; SSE41-LABEL: shuffle_v2i64_bitcast_z123:
; SSE41: # BB#0:
-; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v2i64_bitcast_z123:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2i64_bitcast_z123:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v2i64_bitcast_z123:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovss {{.*}}(%rip), %xmm1
-; AVX512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpxord %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX512VL-NEXT: retq
%bitcast32 = bitcast <2 x i64> %x to <4 x float>
@@ -986,20 +990,10 @@ define <2 x i64> @insert_mem_and_zero_v2i64(i64* %ptr) {
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: retq
;
-; AVX1-LABEL: insert_mem_and_zero_v2i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_mem_and_zero_v2i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_mem_and_zero_v2i64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovq (%rdi), %xmm0
-; AVX512VL-NEXT: retq
+; AVX-LABEL: insert_mem_and_zero_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
%a = load i64, i64* %ptr
%v = insertelement <2 x i64> undef, i64 %a, i32 0
%shuffle = shufflevector <2 x i64> %v, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3>
@@ -1027,20 +1021,10 @@ define <2 x double> @insert_mem_and_zero_v2f64(double* %ptr) {
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: retq
;
-; AVX1-LABEL: insert_mem_and_zero_v2f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_mem_and_zero_v2f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_mem_and_zero_v2f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovsd (%rdi), %xmm0
-; AVX512VL-NEXT: retq
+; AVX-LABEL: insert_mem_and_zero_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: retq
%a = load double, double* %ptr
%v = insertelement <2 x double> undef, double %a, i32 0
%shuffle = shufflevector <2 x double> %v, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3>
@@ -1097,17 +1081,17 @@ define <2 x i64> @insert_reg_lo_v2i64(i64 %a, <2 x i64> %b) {
define <2 x i64> @insert_mem_lo_v2i64(i64* %ptr, <2 x i64> %b) {
; SSE2-LABEL: insert_mem_lo_v2i64:
; SSE2: # BB#0:
-; SSE2-NEXT: movlpd (%rdi), %xmm0
+; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_mem_lo_v2i64:
; SSE3: # BB#0:
-; SSE3-NEXT: movlpd (%rdi), %xmm0
+; SSE3-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_mem_lo_v2i64:
; SSSE3: # BB#0:
-; SSSE3-NEXT: movlpd (%rdi), %xmm0
+; SSSE3-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_mem_lo_v2i64:
@@ -1130,7 +1114,7 @@ define <2 x i64> @insert_mem_lo_v2i64(i64* %ptr, <2 x i64> %b) {
;
; AVX512VL-LABEL: insert_mem_lo_v2i64:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovq (%rdi), %xmm1
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512VL-NEXT: retq
%a = load i64, i64* %ptr
@@ -1163,23 +1147,11 @@ define <2 x i64> @insert_mem_hi_v2i64(i64* %ptr, <2 x i64> %b) {
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
-; AVX1-LABEL: insert_mem_hi_v2i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_mem_hi_v2i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_mem_hi_v2i64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovq (%rdi), %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: insert_mem_hi_v2i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
%a = load i64, i64* %ptr
%v = insertelement <2 x i64> undef, i64 %a, i32 0
%shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
@@ -1193,20 +1165,10 @@ define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) {
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX1-LABEL: insert_reg_lo_v2f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_reg_lo_v2f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_reg_lo_v2f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovsd %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: retq
+; AVX-LABEL: insert_reg_lo_v2f64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: retq
%v = insertelement <2 x double> undef, double %a, i32 0
%shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 0, i32 3>
ret <2 x double> %shuffle
@@ -1215,12 +1177,12 @@ define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) {
define <2 x double> @insert_mem_lo_v2f64(double* %ptr, <2 x double> %b) {
; SSE-LABEL: insert_mem_lo_v2f64:
; SSE: # BB#0:
-; SSE-NEXT: movlpd (%rdi), %xmm0
+; SSE-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX-LABEL: insert_mem_lo_v2f64:
; AVX: # BB#0:
-; AVX-NEXT: vmovlpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; AVX-NEXT: retq
%a = load double, double* %ptr
%v = insertelement <2 x double> undef, double %a, i32 0
@@ -1247,12 +1209,12 @@ define <2 x double> @insert_reg_hi_v2f64(double %a, <2 x double> %b) {
define <2 x double> @insert_mem_hi_v2f64(double* %ptr, <2 x double> %b) {
; SSE-LABEL: insert_mem_hi_v2f64:
; SSE: # BB#0:
-; SSE-NEXT: movhpd (%rdi), %xmm0
+; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSE-NEXT: retq
;
; AVX-LABEL: insert_mem_hi_v2f64:
; AVX: # BB#0:
-; AVX-NEXT: vmovhpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT: retq
%a = load double, double* %ptr
%v = insertelement <2 x double> undef, double %a, i32 0
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 53dbb32235ae..aaf5fa673a15 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -227,7 +227,7 @@ define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) {
;
; AVX-LABEL: shuffle_v4f32_0011:
; AVX: # BB#0:
-; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
ret <4 x float> %shuffle
@@ -240,7 +240,7 @@ define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) {
;
; AVX-LABEL: shuffle_v4f32_2233:
; AVX: # BB#0:
-; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
ret <4 x float> %shuffle
@@ -302,6 +302,35 @@ define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %shuffle
}
+define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: shuffle_v4f32_0145:
+; SSE: # BB#0:
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_0145:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: shuffle_v4f32_6723:
+; SSE: # BB#0:
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4f32_6723:
+; AVX: # BB#0:
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+ ret <4 x float> %shuffle
+}
+
define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_0124:
; SSE2: # BB#0:
@@ -1080,15 +1109,11 @@ define <4 x float> @shuffle_v4f32_0zz6(<4 x float> %a, <4 x float> %b) {
; SSE41-LABEL: shuffle_v4f32_0zz6:
; SSE41: # BB#0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0zz6:
; AVX: # BB#0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 6>
%shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
@@ -1129,15 +1154,11 @@ define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) {
; SSE41-LABEL: shuffle_v4f32_0z24:
; SSE41: # BB#0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0z24:
; AVX: # BB#0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 2, i32 4>
%shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
@@ -1805,6 +1826,162 @@ define <4 x float> @shuffle_v4f32_bitcast_0045(<4 x float> %a, <4 x i32> %b) {
ret <4 x float> %3
}
+define <4 x float> @mask_v4f32_4127(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: mask_v4f32_4127:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: mask_v4f32_4127:
+; SSE3: # BB#0:
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: mask_v4f32_4127:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: mask_v4f32_4127:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: mask_v4f32_4127:
+; AVX: # BB#0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; AVX-NEXT: retq
+ %1 = bitcast <4 x float> %a to <4 x i32>
+ %2 = bitcast <4 x float> %b to <4 x i32>
+ %3 = and <4 x i32> %1, <i32 0, i32 -1, i32 -1, i32 0>
+ %4 = and <4 x i32> %2, <i32 -1, i32 0, i32 0, i32 -1>
+ %5 = or <4 x i32> %4, %3
+ %6 = bitcast <4 x i32> %5 to <4 x float>
+ ret <4 x float> %6
+}
+
+define <4 x float> @mask_v4f32_0127(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: mask_v4f32_0127:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: mask_v4f32_0127:
+; SSE3: # BB#0:
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: mask_v4f32_0127:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: mask_v4f32_0127:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: mask_v4f32_0127:
+; AVX: # BB#0:
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX-NEXT: retq
+ %1 = bitcast <4 x float> %a to <2 x i64>
+ %2 = bitcast <4 x float> %b to <2 x i64>
+ %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
+ %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
+ %5 = or <2 x i64> %4, %3
+ %6 = bitcast <2 x i64> %5 to <4 x float>
+ ret <4 x float> %6
+}
+
+define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: mask_v4i32_0127:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: mask_v4i32_0127:
+; SSE3: # BB#0:
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: mask_v4i32_0127:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: mask_v4i32_0127:
+; SSE41: # BB#0:
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: mask_v4i32_0127:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mask_v4i32_0127:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX2-NEXT: retq
+ %1 = bitcast <4 x i32> %a to <2 x i64>
+ %2 = bitcast <4 x i32> %b to <2 x i64>
+ %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
+ %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
+ %5 = or <2 x i64> %4, %3
+ %6 = bitcast <2 x i64> %5 to <4 x i32>
+ ret <4 x i32> %6
+}
+
+define <4 x float> @broadcast_v4f32_0101_from_v2f32(<2 x float>* %x) {
+; SSE2-LABEL: broadcast_v4f32_0101_from_v2f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
+; SSE3: # BB#0:
+; SSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: broadcast_v4f32_0101_from_v2f32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: broadcast_v4f32_0101_from_v2f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; AVX-NEXT: retq
+ %1 = load <2 x float>, <2 x float>* %x, align 1
+ %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x float> %2
+}
+
define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
; SSE-LABEL: insert_reg_and_zero_v4i32:
; SSE: # BB#0:
@@ -1935,17 +2112,17 @@ define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
; SSE2-LABEL: insert_mem_lo_v4i32:
; SSE2: # BB#0:
-; SSE2-NEXT: movlpd (%rdi), %xmm0
+; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_mem_lo_v4i32:
; SSE3: # BB#0:
-; SSE3-NEXT: movlpd (%rdi), %xmm0
+; SSE3-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_mem_lo_v4i32:
; SSSE3: # BB#0:
-; SSSE3-NEXT: movlpd (%rdi), %xmm0
+; SSSE3-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_mem_lo_v4i32:
@@ -2027,12 +2204,12 @@ define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE-LABEL: insert_mem_lo_v4f32:
; SSE: # BB#0:
-; SSE-NEXT: movlpd (%rdi), %xmm0
+; SSE-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX-LABEL: insert_mem_lo_v4f32:
; AVX: # BB#0:
-; AVX-NEXT: vmovlpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; AVX-NEXT: retq
%a = load <2 x float>, <2 x float>* %ptr
%v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -2060,12 +2237,12 @@ define <4 x float> @insert_reg_hi_v4f32(double %a, <4 x float> %b) {
define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE-LABEL: insert_mem_hi_v4f32:
; SSE: # BB#0:
-; SSE-NEXT: movhpd (%rdi), %xmm0
+; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSE-NEXT: retq
;
; AVX-LABEL: insert_mem_hi_v4f32:
; AVX: # BB#0:
-; AVX-NEXT: vmovhpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT: retq
%a = load <2 x float>, <2 x float>* %ptr
%v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 168b3e33bfcf..e64ca967eaa9 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -61,26 +61,16 @@ define <8 x i16> @shuffle_v8i16_456789AB(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) {
-; SSE2-LABEL: shuffle_v8i16_00000000:
-; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v8i16_00000000:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v8i16_00000000:
-; SSE41: # BB#0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; SSE41-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_00000000:
+; SSE: # BB#0:
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_00000000:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i16_00000000:
@@ -2147,13 +2137,50 @@ define <8 x i16> @shuffle_v8i16_8012345u(<8 x i16> %a) {
ret <8 x i16> %shuffle
}
+define <8 x i16> @mask_v8i16_012345ef(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: mask_v8i16_012345ef:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: mask_v8i16_012345ef:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: mask_v8i16_012345ef:
+; SSE41: # BB#0:
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: mask_v8i16_012345ef:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mask_v8i16_012345ef:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX2-NEXT: retq
+ %1 = bitcast <8 x i16> %a to <2 x i64>
+ %2 = bitcast <8 x i16> %b to <2 x i64>
+ %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
+ %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
+ %5 = or <2 x i64> %4, %3
+ %6 = bitcast <2 x i64> %5 to <8 x i16>
+ ret <8 x i16> %6
+}
+
define <8 x i16> @insert_dup_mem_v8i16_i32(i32* %ptr) {
; SSE2-LABEL: insert_dup_mem_v8i16_i32:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v8i16_i32:
@@ -2190,9 +2217,8 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2: # BB#0:
; SSE2-NEXT: movswl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v8i16_sext_i16:
@@ -2234,9 +2260,8 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_i32(i32* %ptr) {
; SSE2-LABEL: insert_dup_elt1_mem_v8i16_i32:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v8i16_i32:
@@ -2272,9 +2297,9 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(i32* %ptr) {
; SSE2-LABEL: insert_dup_elt3_mem_v8i16_i32:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_i32:
@@ -2311,9 +2336,8 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2: # BB#0:
; SSE2-NEXT: movswl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
@@ -2357,9 +2381,9 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2: # BB#0:
; SSE2-NEXT: movswl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 7e3dc6e294f8..2182ffe0983a 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -7,7 +7,8 @@ target triple = "x86_64-unknown-unknown"
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -22,18 +23,16 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,4,4]
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,4]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <16 x i16> %shuffle
@@ -42,16 +41,16 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -60,16 +59,16 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -78,16 +77,16 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -96,16 +95,16 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -114,16 +113,16 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -132,16 +131,16 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -153,7 +152,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -161,7 +161,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@@ -175,7 +176,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,6,7,0,1]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -196,7 +198,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -216,7 +219,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -236,7 +240,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -256,7 +261,8 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -276,7 +282,8 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -296,7 +303,8 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -313,16 +321,18 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
@@ -331,16 +341,18 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0
define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,7,7,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
ret <16 x i16> %shuffle
@@ -436,7 +448,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_0
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <16 x i16> %shuffle
@@ -452,7 +464,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_0
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -468,7 +480,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_0
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -484,7 +496,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_0
; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -500,7 +512,7 @@ define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_0
; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -516,7 +528,7 @@ define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_0
; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -532,7 +544,7 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0
; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
@@ -541,11 +553,10 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0]
+; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
@@ -559,11 +570,10 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_3
define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0]
+; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
@@ -605,10 +615,10 @@ define <16 x i16> @shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_1
define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0]
+; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
@@ -623,9 +633,10 @@ define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_3
define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
; AVX1: # BB#0:
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
@@ -640,11 +651,10 @@ define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_1
define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,0,65535,0,65535,0,65535,0,65535]
+; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
@@ -659,11 +669,10 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_1
define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,0,65535,0,65535,0,65535,65535,0,65535,0,65535,0,65535,0]
+; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
@@ -699,9 +708,8 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_1
;
; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16>
ret <16 x i16> %shuffle
@@ -721,8 +729,9 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_2
;
; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,0,1,0,1,0,1,12,13,0,1,16,17,16,17,20,21,16,17,16,17,16,17,28,29,16,17]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24>
@@ -1162,7 +1171,8 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_1
define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,12,13,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -1426,7 +1436,7 @@ define <16 x i16> @shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_z
; AVX1: # BB#0:
; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz:
@@ -1654,7 +1664,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -2360,6 +2371,24 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_u
ret <16 x i16> %shuffle
}
+define <16 x i16> @shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,ymm0[4,5],zero,zero,ymm0[8,9,u,u,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 2, i32 16, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %shuffle
+}
+
define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11:
; AVX1: # BB#0:
@@ -2904,8 +2933,8 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_2
;
; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,7,8,9,10,9,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,7,7,8,9,10,11,12,13,15,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
@@ -3269,13 +3298,15 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19(<16 x i16>
define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
; AVX1: # BB#0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
@@ -3286,15 +3317,15 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a,
; AVX1-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
@@ -3312,7 +3343,8 @@ define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16>
define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
; ALL-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
; ALL: # BB#0:
-; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
@@ -3322,18 +3354,76 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a,
; AVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
+define <16 x i16> @shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25(<16 x i16> %a0, <16 x i16> %a1) {
+; AVX1-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,0,1,12,13,2,3,16,17,20,21,20,21,22,23,16,17,16,17,28,29,18,19]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,6,7,6,7,0,1,2,3,2,3,14,15,20,21,18,19,22,23,22,23,16,17,18,19,18,19,30,31]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 0, i32 16, i32 1, i32 17, i32 10, i32 26, i32 11, i32 27, i32 8, i32 24, i32 9, i32 25>
+ ret <16 x i16> %1
+}
+
+define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25(<16 x i16> %a0, <16 x i16> %a1) {
+; AVX1-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,2,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,7,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,0,1,12,13,2,3,16,17,20,21,20,21,22,23,16,17,16,17,28,29,18,19]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,6,7,6,7,0,1,2,3,2,3,14,15,20,21,18,19,22,23,22,23,16,17,18,19,18,19,30,31]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 0, i32 16, i32 1, i32 17, i32 10, i32 26, i32 11, i32 27, i32 8, i32 24, i32 9, i32 25>
+ %2 = bitcast <16 x i16> %1 to <4 x i64>
+ %3 = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ %4 = bitcast <4 x i64> %3 to <16 x i16>
+ ret <16 x i16> %4
+}
+
define <16 x i16> @insert_v16i16_0elt_into_zero_vector(i16* %ptr) {
; ALL-LABEL: insert_v16i16_0elt_into_zero_vector:
; ALL: # BB#0:
@@ -3375,11 +3465,51 @@ define <16 x i16> @concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_b
ret <16 x i16> %shuffle16
}
+define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: PR24935:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,5,5,6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[2,3,2,3,4,5,6,7,8,9,8,9,0,1,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5,6],xmm2[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,4,5,4,5,10,11,4,5,14,15,12,13,0,1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5],xmm0[6],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR24935:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0>
+; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,6,7,8,9,10,11,13,13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4],ymm2[5,6,7,8],ymm0[9,10],ymm2[11],ymm0[12],ymm2[13,14,15]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 27, i32 26, i32 1, i32 29, i32 26, i32 23, i32 11, i32 16, i32 1, i32 9, i32 16, i32 28, i32 13, i32 4, i32 0, i32 24>
+ ret <16 x i16> %shuffle
+}
+
define <16 x i16> @insert_dup_mem_v16i16_i32(i32* %ptr) {
; AVX1-LABEL: insert_dup_mem_v16i16_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -3399,7 +3529,8 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16(i16* %ptr) {
; AVX1: # BB#0:
; AVX1-NEXT: movswl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -3421,7 +3552,8 @@ define <16 x i16> @insert_dup_elt1_mem_v16i16_i32(i32* %ptr) #0 {
; AVX1-LABEL: insert_dup_elt1_mem_v16i16_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 161a21cef030..b0566812ff7d 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -31,9 +31,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <32 x i8> %shuffle
@@ -50,9 +49,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -69,9 +67,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -88,9 +85,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -107,9 +103,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -126,9 +121,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -145,9 +139,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -164,9 +157,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -183,9 +175,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -202,9 +193,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -221,9 +211,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -240,9 +229,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -259,9 +247,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -278,9 +265,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -299,11 +285,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
-; AVX2-NEXT: movl $15, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -818,7 +801,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <32 x i8> %shuffle
@@ -834,7 +817,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -850,7 +833,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -866,7 +849,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -882,7 +865,7 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -902,7 +885,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX2-NEXT: movl $15, %eax
; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
@@ -911,12 +894,10 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
@@ -931,12 +912,10 @@ define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_
define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
@@ -949,20 +928,27 @@ define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_
}
define <32 x i8> @shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31(<32 x i8> %a) {
-; AVX1-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
+; ALL-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
+; ALL: # BB#0:
+; ALL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255]
-; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1],zero,xmm0[2],zero,xmm0[4,u,6,7,8,9,10,11,12,13,14,15]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
+; AVX2-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
; AVX2: # BB#0:
-; AVX2-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,ymm0[2],zero,ymm0[4,u,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX2-NEXT: retq
- %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 32, i32 2, i32 32, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <32 x i8> %shuffle
}
@@ -970,16 +956,15 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
; AVX1: # BB#0:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
; AVX2: # BB#0:
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32>
ret <32 x i8> %shuffle
@@ -988,21 +973,23 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@@ -1465,7 +1452,7 @@ define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,8,8,9,9,8,8,8,8,8,8,8,8]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -2006,8 +1993,7 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_
; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
ret <32 x i8> %shuffle
@@ -2057,6 +2043,36 @@ define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_
ret <32 x i8> %shuffle
}
+define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) {
+; AVX1-LABEL: PR28136:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,10,10,12,12,14,14,9,9,11,11,13,13,15,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,2,2,4,4,6,6,1,1,3,3,5,5,7,7]
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR28136:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50,i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+ %2 = bitcast <32 x i8> %1 to <4 x i64>
+ %3 = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x i64> %3
+}
+
define <32 x i8> @insert_dup_mem_v32i8_i32(i32* %ptr) {
; AVX1-LABEL: insert_dup_mem_v32i8_i32:
; AVX1: # BB#0:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 7e33f5f3aa86..181b2e420203 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -112,8 +112,8 @@ define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_2200:
; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_2200:
@@ -153,8 +153,8 @@ define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_3210(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_3210:
; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_3210:
@@ -488,7 +488,7 @@ define <4 x double> @shuffle_v4f64_15uu(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_11uu(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_11uu:
; ALL: # BB#0:
-; ALL-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
ret <4 x double> %shuffle
@@ -517,9 +517,8 @@ define <4 x double> @shuffle_v4f64_22uu(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_3333(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_3333:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_3333:
@@ -558,7 +557,7 @@ define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0001:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -580,7 +579,7 @@ define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -642,8 +641,8 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1000:
; AVX1: # BB#0:
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -663,8 +662,8 @@ define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_2200:
; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_2200:
@@ -704,8 +703,8 @@ define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_3210:
; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_3210:
@@ -801,7 +800,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX1-NEXT: retq
@@ -850,15 +849,15 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: shuffle_v4i64_0451:
; AVX2: # BB#0:
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_0451:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
@@ -894,14 +893,14 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: shuffle_v4i64_4015:
; AVX2: # BB#0:
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_4015:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX512VL-NEXT: retq
@@ -1153,7 +1152,7 @@ define <4 x i64> @shuffle_v4i64_22uu(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_22uu:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_22uu:
@@ -1172,9 +1171,8 @@ define <4 x i64> @shuffle_v4i64_22uu(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_3333(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_3333:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_3333:
@@ -1212,20 +1210,10 @@ define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
}
define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
-; AVX1-LABEL: insert_mem_and_zero_v4i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_mem_and_zero_v4i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_mem_and_zero_v4i64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovq (%rdi), %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: insert_mem_and_zero_v4i64:
+; ALL: # BB#0:
+; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: retq
%a = load i64, i64* %ptr
%v = insertelement <4 x i64> undef, i64 %a, i64 0
%shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -1235,12 +1223,14 @@ define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
; AVX1-LABEL: insert_reg_and_zero_v4f64:
; AVX1: # BB#0:
+; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_reg_and_zero_v4f64:
; AVX2: # BB#0:
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; AVX2-NEXT: vxorpd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX2-NEXT: retq
@@ -1248,7 +1238,7 @@ define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
; AVX512VL-LABEL: insert_reg_and_zero_v4f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovsd %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512VL-NEXT: retq
%v = insertelement <4 x double> undef, double %a, i32 0
%shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -1256,20 +1246,10 @@ define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
}
define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {
-; AVX1-LABEL: insert_mem_and_zero_v4f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_mem_and_zero_v4f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_mem_and_zero_v4f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovsd (%rdi), %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: insert_mem_and_zero_v4f64:
+; ALL: # BB#0:
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: retq
%a = load double, double* %ptr
%v = insertelement <4 x double> undef, double %a, i32 0
%shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -1342,8 +1322,7 @@ define <4 x double> @splat_v4f64(<2 x double> %r) {
define <4 x i64> @splat_mem_v4i64_from_v2i64(<2 x i64>* %ptr) {
; AVX1-LABEL: splat_mem_v4i64_from_v2i64:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splat_mem_v4i64_from_v2i64:
@@ -1416,6 +1395,28 @@ define <4 x double> @splat128_mem_v4f64_from_v2f64(<2 x double>* %ptr) {
ret <4 x double> %shuffle
}
+define <4 x double> @broadcast_v4f64_0000_from_v2i64(<2 x i64> %a0) {
+; AVX1-LABEL: broadcast_v4f64_0000_from_v2i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: broadcast_v4f64_0000_from_v2i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: broadcast_v4f64_0000_from_v2i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX512VL-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = bitcast <4 x i64> %1 to <4 x double>
+ %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %3
+}
+
define <4 x double> @bitcast_v4f64_0426(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: bitcast_v4f64_0426:
; AVX1: # BB#0:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index e8b886afd1ae..4aab5cd17009 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -29,8 +29,8 @@ define <8 x float> @shuffle_v8f32_00000010(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_00000010:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <8 x float> %shuffle
@@ -46,8 +46,8 @@ define <8 x float> @shuffle_v8f32_00000200(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_00000200:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <8 x float> %shuffle
@@ -63,8 +63,8 @@ define <8 x float> @shuffle_v8f32_00003000(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_00003000:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <8 x float> %shuffle
@@ -152,8 +152,8 @@ define <8 x float> @shuffle_v8f32_01014545(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00112233(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00112233:
; AVX1: # BB#0:
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0,0,1,1]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -176,8 +176,8 @@ define <8 x float> @shuffle_v8f32_00001111(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_00001111:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
ret <8 x float> %shuffle
@@ -195,18 +195,15 @@ define <8 x float> @shuffle_v8f32_81a3c5e7(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_08080808:
; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_08080808:
; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastss %xmm1, %ymm1
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
ret <8 x float> %shuffle
@@ -216,7 +213,7 @@ define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_08084c4c:
; ALL: # BB#0:
; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
-; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
ret <8 x float> %shuffle
@@ -299,10 +296,10 @@ define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_08991abb:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
-; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
@@ -338,8 +335,8 @@ define <8 x float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_09ab1def:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
-; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
@@ -647,10 +644,10 @@ define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_c348cda0:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,3,4,u,u,u,u,0>
-; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u>
; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0>
@@ -660,21 +657,21 @@ define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_f511235a:
; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[0,0,3,2]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,1,1,4,5,5,5]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3]
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_f511235a:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <7,u,u,u,u,u,u,2>
-; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,5,1,1,2,3,5,u>
-; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,1,2]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10>
@@ -690,42 +687,29 @@ define <8 x float> @shuffle_v8f32_32103210(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_32103210:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
ret <8 x float> %shuffle
}
define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_76547654:
-; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v8f32_76547654:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_76547654:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
ret <8 x float> %shuffle
}
define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_76543210:
-; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v8f32_76543210:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_76543210:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret <8 x float> %shuffle
}
@@ -783,11 +767,8 @@ define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) {
;
; AVX2-LABEL: PR21138:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,u,u,1,3,5,7>
-; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <1,3,5,7,u,u,u,u>
-; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %truc, <8 x float> %tchose, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
ret <8 x float> %shuffle
@@ -806,10 +787,10 @@ define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_ba983210:
; ALL: # BB#0:
-; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: retq
- %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+ %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 3, i32 2, i32 1, i32 0>
ret <8 x float> %shuffle
}
@@ -844,15 +825,14 @@ define <8 x float> @shuffle_v8f32_uuuu1111(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_44444444(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_44444444:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_44444444:
; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
ret <8 x float> %shuffle
@@ -897,17 +877,11 @@ define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) {
}
define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_5555uuuu:
-; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v8f32_5555uuuu:
-; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_5555uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x float> %shuffle
}
@@ -930,15 +904,15 @@ define <8 x i32> @shuffle_v8i32_00000000(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00000010:
; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_00000010:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <8 x i32> %shuffle
@@ -947,15 +921,15 @@ define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00000200:
; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_00000200:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,2]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <8 x i32> %shuffle
@@ -964,15 +938,15 @@ define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00003000:
; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_00003000:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <8 x i32> %shuffle
@@ -1065,8 +1039,8 @@ define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00112233:
; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -1082,15 +1056,15 @@ define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00001111(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00001111:
; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_00001111:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
ret <8 x i32> %shuffle
@@ -1113,18 +1087,15 @@ define <8 x i32> @shuffle_v8i32_81a3c5e7(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_08080808:
; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_08080808:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
ret <8 x i32> %shuffle
@@ -1134,7 +1105,7 @@ define <8 x i32> @shuffle_v8i32_08084c4c(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_08084c4c:
; AVX1: # BB#0:
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_08084c4c:
@@ -1252,10 +1223,10 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
;
; AVX2-LABEL: shuffle_v8i32_08991abb:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
@@ -1265,8 +1236,8 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_091b2d3f:
; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-NEXT: retq
@@ -1283,15 +1254,15 @@ define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_09ab1def:
; AVX1: # BB#0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_09ab1def:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
@@ -1768,19 +1739,18 @@ define <8 x i32> @shuffle_v8i32_uuu3uu66(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_6caa87e5:
; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,2],ymm2[4,4],ymm1[6,6]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_6caa87e5:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,4,2,2,0,u,6,u>
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,3,2]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,0,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5>
@@ -1796,8 +1766,8 @@ define <8 x i32> @shuffle_v8i32_32103210(<8 x i32> %a, <8 x i32> %b) {
;
; AVX2-LABEL: shuffle_v8i32_32103210:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
ret <8 x i32> %shuffle
@@ -1806,15 +1776,14 @@ define <8 x i32> @shuffle_v8i32_32103210(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_76547654(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_76547654:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_76547654:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
@@ -1823,14 +1792,14 @@ define <8 x i32> @shuffle_v8i32_76547654(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_76543210(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_76543210:
; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_76543210:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret <8 x i32> %shuffle
@@ -1935,7 +1904,7 @@ define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_zuu8zuuc(<8 x i32> %a) {
; AVX1-LABEL: shuffle_v8i32_zuu8zuuc:
; AVX1: # BB#0:
-; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
; AVX1-NEXT: retq
;
@@ -1950,7 +1919,7 @@ define <8 x i32> @shuffle_v8i32_zuu8zuuc(<8 x i32> %a) {
define <8 x i32> @shuffle_v8i32_9ubzdefz(<8 x i32> %a) {
; AVX1-LABEL: shuffle_v8i32_9ubzdefz:
; AVX1: # BB#0:
-; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm1[2,0],ymm0[5,6],ymm1[6,4]
; AVX1-NEXT: retq
@@ -2014,15 +1983,14 @@ define <8 x i32> @shuffle_v8i32_2A3Buuuu(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_44444444(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_44444444:
; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_44444444:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
ret <8 x i32> %shuffle
@@ -2032,13 +2000,13 @@ define <8 x i32> @shuffle_v8i32_5555uuuu(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_5555uuuu:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_5555uuuu:
; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX2-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i32> %shuffle
@@ -2077,9 +2045,9 @@ define <8 x float> @splat_v8f32(<4 x float> %r) {
define <8 x i32> @shuffle_v8i32_z0U2zUz6(<8 x i32> %a) {
; AVX1-LABEL: shuffle_v8i32_z0U2zUz6:
; AVX1: # BB#0:
-; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_z0U2zUz6:
@@ -2093,9 +2061,9 @@ define <8 x i32> @shuffle_v8i32_z0U2zUz6(<8 x i32> %a) {
define <8 x i32> @shuffle_v8i32_1U3z5zUU(<8 x i32> %a) {
; AVX1-LABEL: shuffle_v8i32_1U3z5zUU:
; AVX1: # BB#0:
-; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_1U3z5zUU:
@@ -2198,7 +2166,7 @@ define <8x float> @concat_v2f32_1(<2 x float>* %tmp64, <2 x float>* %tmp65) {
; ALL-LABEL: concat_v2f32_1:
; ALL: # BB#0: # %entry
; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
+; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; ALL-NEXT: retq
entry:
%tmp74 = load <2 x float>, <2 x float>* %tmp65, align 8
@@ -2213,7 +2181,7 @@ define <8x float> @concat_v2f32_2(<2 x float>* %tmp64, <2 x float>* %tmp65) {
; ALL-LABEL: concat_v2f32_2:
; ALL: # BB#0: # %entry
; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
+; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; ALL-NEXT: retq
entry:
%tmp74 = load <2 x float>, <2 x float>* %tmp65, align 8
@@ -2226,7 +2194,7 @@ define <8x float> @concat_v2f32_3(<2 x float>* %tmp64, <2 x float>* %tmp65) {
; ALL-LABEL: concat_v2f32_3:
; ALL: # BB#0: # %entry
; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
+; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; ALL-NEXT: retq
entry:
%tmp74 = load <2 x float>, <2 x float>* %tmp65, align 8
diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll
index bef54b05041b..d75184951344 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -4,6 +4,25 @@
target triple = "x86_64-unknown-unknown"
+define <16 x float> @shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastss %xmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x float> %shuffle
+}
+
+define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; ALL-NEXT: vbroadcastss %xmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x float> %shuffle
+}
+
define <16 x float> @shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d:
; ALL: # BB#0:
@@ -70,6 +89,70 @@ define <16 x float> @shuffle_v16f32_zz_12_zz_13_zz_16_zz_17_zz_1a_zz_1b_zz_1e_zz
ret <16 x float> %shuffle
}
+define <16 x float> @shuffle_v16f32_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
+; ALL: # BB#0:
+; ALL-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ ret <16 x float> %shuffle
+}
+
+define <16 x float> @shuffle_v16f32_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ ret <16 x float> %shuffle
+}
+
+define <16 x float> @shuffle_v16f32_00_01_00_01_06_07_06_07_08_09_10_11_12_13_12_13(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_00_01_00_01_06_07_06_07_08_09_10_11_12_13_12_13:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,3,3,4,5,6,6]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 12, i32 13>
+ ret <16 x float> %shuffle
+}
+
+define <16 x float> @shuffle_v16f32_00_00_02_00_04_04_06_04_08_08_10_08_12_12_14_12(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_00_00_02_00_04_04_06_04_08_08_10_08_12_12_14_12:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4,8,8,10,8,12,12,14,12]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4, i32 8, i32 8, i32 10, i32 8, i32 12, i32 12, i32 14, i32 12>
+ ret <16 x float> %shuffle
+}
+
+define <16 x float> @shuffle_v16f32_03_uu_uu_uu_uu_04_uu_uu_uu_uu_11_uu_uu_uu_uu_12(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_03_uu_uu_uu_uu_04_uu_uu_uu_uu_11_uu_uu_uu_uu_12:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,0,3,0,7,4,7,4,11,8,11,8,15,12,15,12]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 12>
+ ret <16 x float> %shuffle
+}
+
+define <16 x i32> @shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i32> %a, <16 x i32> %b) {
+; ALL-LABEL: shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; ALL: # BB#0:
+; ALL-NEXT: vpbroadcastd %xmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i32> %shuffle
+}
+
+define <16 x i32> @shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<16 x i32> %a, <16 x i32> %b) {
+; ALL-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti32x4 $1, %zmm0, %xmm0
+; ALL-NEXT: vpbroadcastd %xmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+ ret <16 x i32> %shuffle
+}
+
define <16 x i32> @shuffle_v16i32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f:
; ALL: # BB#0:
@@ -89,6 +172,21 @@ define <16 x i32> @shuffle_v16i32_02_zz_03_zz_06_zz_07_zz_0a_zz_0b_zz_0e_zz_0f_z
ret <16 x i32> %shuffle
}
+define <16 x i32> @shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28(<16 x i32> %a, <16 x i32> %b) {
+; AVX512F-LABEL: shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [1,2,3,16,5,6,7,20,9,10,11,24,13,14,15,28]
+; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3],zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19],zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35],zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51]
+; AVX512BW-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 16, i32 5, i32 6, i32 7, i32 20, i32 9, i32 10, i32 11, i32 24, i32 13, i32 14, i32 15, i32 28>
+ ret <16 x i32> %shuffle
+}
+
define <16 x float> @shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x float> %a) {
; ALL-LABEL: shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01:
; ALL: # BB#0:
@@ -172,3 +270,30 @@ define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
%v2 = shufflevector <16 x float> %v_a, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <8 x float> %v2
}
+
+define <16 x i32> @shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12(<16 x i32> %a, <16 x i32> %b) {
+; ALL-LABEL: shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; ALL-NEXT: retq
+ %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
+ ret <16 x i32> %c
+}
+
+define <16 x i32> @shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12(<16 x i32> %a, <16 x i32> %b) {
+; ALL-LABEL: shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; ALL-NEXT: retq
+ %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
+ ret <16 x i32> %c
+}
+
+define <16 x float> @shuffle_v16f32_00_01_10_10_04_05_14_14_08_09_18_18_0c_0d_1c_1c(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_00_01_10_10_04_05_14_14_08_09_18_18_0c_0d_1c_1c:
+; ALL: # BB#0:
+; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,1],zmm1[0,0],zmm0[4,5],zmm1[4,4],zmm0[8,9],zmm1[8,8],zmm0[12,13],zmm1[12,12]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
+ ret <16 x float> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v32.ll b/test/CodeGen/X86/vector-shuffle-512-v32.ll
index ab809beb4b48..bcc4ad2d1412 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -1,8 +1,27 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
target triple = "x86_64-unknown-unknown"
+define <32 x i16> @shuffle_v32i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i16> %a) {
+; ALL-LABEL: shuffle_v32i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; ALL: # BB#0:
+; ALL-NEXT: vpbroadcastw %xmm0, %zmm0
+; ALL-NEXT: retq
+ %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> zeroinitializer
+ ret <32 x i16> %c
+}
+
+define <32 x i16> @shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<32 x i16> %a) {
+; ALL-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
+; ALL: # BB#0:
+; ALL-NEXT: vextracti32x4 $1, %zmm0, %xmm0
+; ALL-NEXT: vpbroadcastw %xmm0, %zmm0
+; ALL-NEXT: retq
+ %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+ ret <32 x i16> %c
+}
+
define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f(<32 x i16> %a) {
; ALL-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f:
; ALL: # BB#0:
@@ -26,8 +45,7 @@ define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_1
define <32 x i16> @shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u(<32 x i16> %a, <32 x i16> %b) {
; ALL-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqu16 {{.*#+}} zmm2 = <0,32,1,33,2,34,3,35,8,40,9,41,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; ALL-NEXT: vpermt2w %zmm1, %zmm2, %zmm0
+; ALL-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; ALL-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i16> %c
@@ -36,9 +54,55 @@ define <32 x i16> @shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u(<32 x i1
define <32 x i16> @shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u(<32 x i16> %a, <32 x i16> %b) {
; ALL-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqu16 {{.*#+}} zmm2 = <4,36,5,37,6,38,7,39,12,44,13,45,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; ALL-NEXT: vpermt2w %zmm1, %zmm2, %zmm0
+; ALL-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; ALL-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i16> %c
}
+
+define <32 x i16> @shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z(<32 x i16> %a, <32 x i16> %b) {
+; ALL-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z:
+; ALL: # BB#0:
+; ALL-NEXT: vpsrld $16, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 34, i32 3, i32 34, i32 5, i32 34, i32 7, i32 34, i32 9, i32 34, i32 11, i32 34, i32 13, i32 34, i32 15, i32 34, i32 17, i32 34, i32 19, i32 34, i32 21, i32 34, i32 23, i32 34, i32 25, i32 34, i32 27, i32 34, i32 29, i32 34, i32 31, i32 34>
+ ret <32 x i16> %c
+}
+
+define <32 x i16> @shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30(<32 x i16> %a, <32 x i16> %b) {
+; ALL-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30:
+; ALL: # BB#0:
+; ALL-NEXT: vpslld $16, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 34, i32 0, i32 34, i32 2, i32 34, i32 4, i32 34, i32 6, i32 34, i32 8, i32 34, i32 10, i32 34, i32 12, i32 34, i32 14, i32 34, i32 16, i32 34, i32 18, i32 34, i32 20, i32 34, i32 22, i32 34, i32 24, i32 34, i32 26, i32 34, i32 28, i32 34, i32 30>
+ ret <32 x i16> %c
+}
+
+define <32 x i16> @shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31(<32 x i16> %a, <32 x i16> %b) {
+; ALL-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31:
+; ALL: # BB#0:
+; ALL-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31]
+; ALL-NEXT: retq
+ %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i16> %c
+}
+
+define <32 x i16> @shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) {
+; ALL-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28]
+; ALL-NEXT: retq
+ %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
+ ret <32 x i16> %c
+}
+
+define <32 x i16> @shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) {
+; ALL-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28:
+; ALL: # BB#0:
+; ALL-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31]
+; ALL-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28]
+; ALL-NEXT: retq
+ %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 5, i32 5, i32 4, i32 4, i32 9, i32 9, i32 8, i32 8, i32 13, i32 13, i32 12, i32 12, i32 17, i32 17, i32 16, i32 16, i32 21, i32 21, i32 20, i32 20, i32 25, i32 25, i32 24, i32 24, i32 29, i32 29, i32 28, i32 28>
+ ret <32 x i16> %c
+}
+
diff --git a/test/CodeGen/X86/vector-shuffle-512-v64.ll b/test/CodeGen/X86/vector-shuffle-512-v64.ll
new file mode 100644
index 000000000000..d637c0e2428c
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
+
+target triple = "x86_64-unknown-unknown"
+
+define <64 x i8> @shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u(<64 x i8> %a) {
+; AVX512F-LABEL: shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512DQ-NEXT: retq
+ %b = shufflevector <64 x i8> %a, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <64 x i8> %b
+}
+
+define <64 x i8> @shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> %b) {
+; AVX512F-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512F-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpslldq {{.*#+}} zmm0 = zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512DQ-NEXT: retq
+ %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
+ ret <64 x i8> %shuffle
+}
+
+define <64 x i8> @shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz(<64 x i8> %a, <64 x i8> %b) {
+; AVX512F-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX512F-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zmm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zmm0[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zmm0[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX512DQ-NEXT: retq
+ %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 64, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 64, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 64, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 64>
+ ret <64 x i8> %shuffle
+}
+
+define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> %b) {
+; AVX512F-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm3[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm1[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zmm1[31],zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zmm1[47],zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zmm1[63],zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm3[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512DQ-NEXT: retq
+ %shuffle = shufflevector <64 x i8> %a, <64 x i8> %b, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
+ ret <64 x i8> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 631968f6afa2..d39961d9c427 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -1,9 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
-target triple = "x86_64-unknown-unknown"
-
define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_00000000:
; AVX512F: # BB#0:
@@ -18,6 +16,38 @@ define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) {
ret <8 x double> %shuffle
}
+define <8 x double> @shuffle_v8f64_22222222(<8 x double> %a, <8 x double> %b) {
+; AVX512F-LABEL: shuffle_v8f64_22222222:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vextractf32x4 $1, %zmm0, %xmm0
+; AVX512F-NEXT: vbroadcastsd %xmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_22222222:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vextractf32x4 $1, %zmm0, %xmm0
+; AVX512F-32-NEXT: vbroadcastsd %xmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_44444444(<8 x double> %a, <8 x double> %b) {
+; AVX512F-LABEL: shuffle_v8f64_44444444:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; AVX512F-NEXT: vbroadcastsd %xmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_44444444:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; AVX512F-32-NEXT: vbroadcastsd %xmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_00000010:
; AVX512F: # BB#0:
@@ -140,12 +170,12 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_01014545:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_01014545:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
ret <8 x double> %shuffle
@@ -187,16 +217,12 @@ define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_81a3c5e7:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,2,11,4,13,6,15]
-; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vshufpd {{.*#+}} zmm0 = zmm1[0],zmm0[1],zmm1[2],zmm0[3],zmm1[4],zmm0[5],zmm1[6],zmm0[7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_81a3c5e7:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,9,0,2,0,11,0,4,0,13,0,6,0,15,0]
-; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vshufpd {{.*#+}} zmm0 = zmm1[0],zmm0[1],zmm1[2],zmm0[3],zmm1[4],zmm0[5],zmm1[6],zmm0[7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
ret <8 x double> %shuffle
@@ -403,14 +429,12 @@ define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_00014445:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,4,4,4,5]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00014445:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,4,0,4,0,4,0,5,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
ret <8 x double> %shuffle
@@ -420,14 +444,12 @@ define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_00204464:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,0,4,4,6,4]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00204464:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,4,0,6,0,4,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
ret <8 x double> %shuffle
@@ -437,14 +459,12 @@ define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_03004744:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,3,0,0,4,7,4,4]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_03004744:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,7,0,4,0,4,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -454,14 +474,12 @@ define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_10005444:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,5,4,4,4]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_10005444:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,5,0,4,0,4,0,4,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -471,14 +489,12 @@ define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_22006644:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,0,0,6,6,4,4]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_22006644:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,6,0,4,0,4,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -488,14 +504,12 @@ define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_33307774:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,0,7,7,7,4]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_33307774:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,7,0,7,0,4,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
ret <8 x double> %shuffle
@@ -505,14 +519,12 @@ define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_32107654:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,2,1,0,7,6,5,4]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_32107654:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x double> %shuffle
@@ -522,14 +534,12 @@ define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_00234467:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,3,4,4,6,7]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00234467:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,4,0,4,0,6,0,7,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -539,14 +549,12 @@ define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_00224466:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,2,4,4,6,6]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00224466:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,4,0,4,0,6,0,6,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -556,14 +564,12 @@ define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_10325476:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,3,2,5,4,7,6]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_10325476:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,3,0,2,0,5,0,4,0,7,0,6,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
ret <8 x double> %shuffle
@@ -573,14 +579,12 @@ define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_11335577:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,3,3,5,5,7,7]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_11335577:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,3,0,3,0,5,0,5,0,7,0,7,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
ret <8 x double> %shuffle
@@ -590,14 +594,12 @@ define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_10235467:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,2,3,5,4,6,7]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_10235467:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,2,0,3,0,5,0,4,0,6,0,7,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -607,14 +609,12 @@ define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_10225466:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,2,2,5,4,6,6]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_10225466:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,2,0,2,0,5,0,4,0,6,0,6,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -777,14 +777,12 @@ define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_10324567:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,3,2,4,5,6,7]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_10324567:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,3,0,2,0,4,0,5,0,6,0,7,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -794,14 +792,12 @@ define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_11334567:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,3,3,4,5,6,7]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,4,5,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_11334567:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,3,0,3,0,4,0,5,0,6,0,7,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,4,5,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -811,14 +807,12 @@ define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_01235467:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,5,4,6,7]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_01235467:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,7,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -828,14 +822,12 @@ define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_01235466:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,5,4,6,6]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_01235466:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,6,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -879,14 +871,12 @@ define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_103245uu:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,0,3,2,4,5,u,u>
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_103245uu:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,0,0,0,3,0,2,0,4,0,5,0,u,u,u,u>
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
ret <8 x double> %shuffle
@@ -896,14 +886,12 @@ define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_1133uu67:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,1,3,3,u,u,6,7>
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,4,4,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_1133uu67:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,0,1,0,3,0,3,0,u,u,u,u,6,0,7,0>
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,4,4,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -913,14 +901,12 @@ define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_0uu354uu:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,u,u,3,5,4,u,u>
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,5,4,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_0uu354uu:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,u,u,u,u,3,0,5,0,4,0,u,u,u,u>
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,5,4,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
ret <8 x double> %shuffle
@@ -930,14 +916,12 @@ define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_uuu3uu66:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,3,u,u,6,6>
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_uuu3uu66:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,u,u,3,0,u,u,u,u,6,0,6,0>
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -994,6 +978,38 @@ define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) {
ret <8 x i64> %shuffle
}
+define <8 x i64> @shuffle_v8i64_44444444(<8 x i64> %a, <8 x i64> %b) {
+; AVX512F-LABEL: shuffle_v8i64_44444444:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8i64_44444444:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; AVX512F-32-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_66666666(<8 x i64> %a, <8 x i64> %b) {
+; AVX512F-LABEL: shuffle_v8i64_66666666:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8i64_66666666:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; AVX512F-32-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
+ ret <8 x i64> %shuffle
+}
+
define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00000010:
@@ -1389,14 +1405,12 @@ define <8 x i64> @shuffle_v8i64_00014445(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00014445:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,4,4,4,5]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00014445:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,4,0,4,0,4,0,5,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
ret <8 x i64> %shuffle
@@ -1406,14 +1420,12 @@ define <8 x i64> @shuffle_v8i64_00204464(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00204464:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,0,4,4,6,4]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00204464:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,4,0,6,0,4,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
ret <8 x i64> %shuffle
@@ -1423,14 +1435,12 @@ define <8 x i64> @shuffle_v8i64_03004744(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_03004744:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,3,0,0,4,7,4,4]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_03004744:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,7,0,4,0,4,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1440,14 +1450,12 @@ define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_10005444:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,5,4,4,4]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_10005444:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,5,0,4,0,4,0,4,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1457,14 +1465,12 @@ define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_22006644:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,0,0,6,6,4,4]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_22006644:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,6,0,4,0,4,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1474,14 +1480,12 @@ define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_33307774:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,0,7,7,7,4]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_33307774:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,7,0,7,0,4,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
ret <8 x i64> %shuffle
@@ -1491,14 +1495,12 @@ define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_32107654:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,2,1,0,7,6,5,4]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_32107654:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x i64> %shuffle
@@ -1508,14 +1510,12 @@ define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00234467:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,3,4,4,6,7]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00234467:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,4,0,4,0,6,0,7,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1525,14 +1525,12 @@ define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00224466:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,2,4,4,6,6]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00224466:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,4,0,4,0,6,0,6,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -1542,14 +1540,12 @@ define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_10325476:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,3,2,5,4,7,6]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_10325476:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,3,0,2,0,5,0,4,0,7,0,6,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
ret <8 x i64> %shuffle
@@ -1559,14 +1555,12 @@ define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_11335577:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,3,3,5,5,7,7]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_11335577:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,3,0,3,0,5,0,5,0,7,0,7,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
ret <8 x i64> %shuffle
@@ -1576,14 +1570,12 @@ define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_10235467:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,2,3,5,4,6,7]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_10235467:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,2,0,3,0,5,0,4,0,6,0,7,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1593,14 +1585,12 @@ define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_10225466:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,2,2,5,4,6,6]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_10225466:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,2,0,2,0,5,0,4,0,6,0,6,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -2096,15 +2086,15 @@ define <8 x double> @test_vshuff64x2_512_maskz(<8 x double> %x, <8 x double> %x1
; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm2
; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[2,3,0,1]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_vshuff64x2_512_maskz:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpmovsxwq %xmm2, %zmm2
-; AVX512F-32-NEXT: vpsllvq .LCPI122_0, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm2, %zmm2
; AVX512F-32-NEXT: vptestmq %zmm2, %zmm2, %k1
-; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[2,3,0,1]
; AVX512F-32-NEXT: retl
%y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
%res = select <8 x i1> %mask, <8 x double> %y, <8 x double> zeroinitializer
@@ -2117,15 +2107,15 @@ define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1>
; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm2
; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k1
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm0[0,1,4,5],zmm1[2,3,0,1]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_vshufi64x2_512_mask:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpmovsxwq %xmm2, %zmm2
-; AVX512F-32-NEXT: vpsllvq .LCPI123_0, %zmm2, %zmm2
+; AVX512F-32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm2, %zmm2
; AVX512F-32-NEXT: vptestmq %zmm2, %zmm2, %k1
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm0[0,1,4,5],zmm1[2,3,0,1]
; AVX512F-32-NEXT: retl
%y = shufflevector <8 x i64> %x, <8 x i64> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
%res = select <8 x i1> %mask, <8 x i64> %y, <8 x i64> %x
@@ -2154,16 +2144,16 @@ define <8 x double> @test_vshuff64x2_512_mem_mask(<8 x double> %x, <8 x double>
; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm0[0,1,4,5],mem[2,3,0,1]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_vshuff64x2_512_mem_mask:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpmovsxwq %xmm1, %zmm1
-; AVX512F-32-NEXT: vpsllvq .LCPI125_0, %zmm1, %zmm1
+; AVX512F-32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1
; AVX512F-32-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm0[0,1,4,5],mem[2,3,0,1]
; AVX512F-32-NEXT: retl
%x1 = load <8 x double>,<8 x double> *%ptr,align 1
%y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
@@ -2177,16 +2167,16 @@ define <8 x double> @test_vshuff64x2_512_mem_maskz(<8 x double> %x, <8 x double>
; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],mem[2,3,0,1]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_vshuff64x2_512_mem_maskz:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpmovsxwq %xmm1, %zmm1
-; AVX512F-32-NEXT: vpsllvq .LCPI126_0, %zmm1, %zmm1
+; AVX512F-32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1
; AVX512F-32-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],mem[2,3,0,1]
; AVX512F-32-NEXT: retl
%x1 = load <8 x double>,<8 x double> *%ptr,align 1
%y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
@@ -2207,3 +2197,59 @@ define <16 x float> @test_vshuff32x4_512(<16 x float> %x, <16 x float> %x1) noun
%res = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
ret <16 x float> %res
}
+
+define <8 x double> @shuffle_v8f64_23014567(<8 x double> %a0, <8 x double> %a1) {
+; AVX512F-LABEL: shuffle_v8f64_23014567:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[2,3,0,1,4,5,6,7]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_23014567:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[2,3,0,1,4,5,6,7]
+; AVX512F-32-NEXT: retl
+ %1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %1
+}
+
+define <8 x double> @shuffle_v8f64_2301uu67(<8 x double> %a0, <8 x double> %a1) {
+; AVX512F-LABEL: shuffle_v8f64_2301uu67:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[2,3,0,1,0,1,6,7]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_2301uu67:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[2,3,0,1,0,1,6,7]
+; AVX512F-32-NEXT: retl
+ %1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 undef, i32 undef, i32 6, i32 7>
+ ret <8 x double> %1
+}
+
+define <8 x double> @shuffle_v8f64_2301uuuu(<8 x double> %a0, <8 x double> %a1) {
+; AVX512F-LABEL: shuffle_v8f64_2301uuuu:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm1[2,3,0,1,6,7,4,5]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_2301uuuu:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm1[2,3,0,1,6,7,4,5]
+; AVX512F-32-NEXT: retl
+ %1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x double> %1
+}
+
+define <8 x double> @shuffle_v8f64_uuu2301(<8 x double> %a0, <8 x double> %a1) {
+; AVX512F-LABEL: shuffle_v8f64_uuu2301:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm1[2,3,0,1]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_uuu2301:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm1[2,3,0,1]
+; AVX512F-32-NEXT: retl
+ %1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 3, i32 0, i32 1>
+ ret <8 x double> %1
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/test/CodeGen/X86/vector-shuffle-combining-avx.ll
new file mode 100644
index 000000000000..ac18bba166f1
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -0,0 +1,242 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512F
+;
+; Combine tests involving AVX target shuffles
+
+declare <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float>, i8)
+declare <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float>, i8)
+declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8)
+declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8)
+
+declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
+declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
+declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>)
+declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>)
+
+declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8)
+declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8)
+declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8)
+
+define <4 x float> @combine_vpermilvar_4f32_identity(<4 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f32_identity:
+; ALL: # BB#0:
+; ALL-NEXT: retq
+ %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
+ %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_vpermilvar_4f32_movddup(<4 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f32_movddup:
+; ALL: # BB#0:
+; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; ALL-NEXT: retq
+ %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 1, i32 0, i32 1>)
+ ret <4 x float> %1
+}
+define <4 x float> @combine_vpermilvar_4f32_movddup_load(<4 x float> *%a0) {
+; ALL-LABEL: combine_vpermilvar_4f32_movddup_load:
+; ALL: # BB#0:
+; ALL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; ALL-NEXT: retq
+ %1 = load <4 x float>, <4 x float> *%a0
+ %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>)
+ ret <4 x float> %2
+}
+
+define <4 x float> @combine_vpermilvar_4f32_movshdup(<4 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f32_movshdup:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; ALL-NEXT: retq
+ %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 undef, i32 1, i32 3, i32 3>)
+ ret <4 x float> %1
+}
+
+define <4 x float> @combine_vpermilvar_4f32_movsldup(<4 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f32_movsldup:
+; ALL: # BB#0:
+; ALL-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; ALL-NEXT: retq
+ %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 undef>)
+ ret <4 x float> %1
+}
+
+define <4 x float> @combine_vpermilvar_4f32_unpckh(<4 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f32_unpckh:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; ALL-NEXT: retq
+ %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 2, i32 2, i32 3, i32 3>)
+ ret <4 x float> %1
+}
+
+define <4 x float> @combine_vpermilvar_4f32_unpckl(<4 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f32_unpckl:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; ALL-NEXT: retq
+ %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 1, i32 1>)
+ ret <4 x float> %1
+}
+
+define <8 x float> @combine_vpermilvar_8f32_identity(<8 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_8f32_identity:
+; ALL: # BB#0:
+; ALL-NEXT: retq
+ %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 undef>)
+ %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1>)
+ ret <8 x float> %2
+}
+
+define <8 x float> @combine_vpermilvar_8f32_10326u4u(<8 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_8f32_10326u4u:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,6,u,4,u]
+; ALL-NEXT: retq
+ %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 0, i32 1, i32 2, i32 undef>)
+ %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 undef>)
+ ret <8 x float> %2
+}
+
+define <8 x float> @combine_vpermilvar_vperm2f128_8f32(<8 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_vperm2f128_8f32:
+; ALL: # BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; ALL-NEXT: retq
+ %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
+ %2 = shufflevector <8 x float> %1, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+ %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %2, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
+ ret <8 x float> %3
+}
+
+define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_vperm2f128_zero_8f32:
+; ALL: # BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
+; ALL-NEXT: retq
+ %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
+ %2 = shufflevector <8 x float> %1, <8 x float> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 0, i32 1, i32 2, i32 3>
+ %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %2, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
+ ret <8 x float> %3
+}
+
+define <4 x double> @combine_vperm2f128_vpermilvar_as_vpblendpd(<4 x double> %a0) {
+; ALL-LABEL: combine_vperm2f128_vpermilvar_as_vpblendpd:
+; ALL: # BB#0:
+; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT: retq
+ %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
+ %2 = shufflevector <4 x double> %1, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %3 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %2, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
+ ret <4 x double> %3
+}
+
+define <8 x float> @combine_vpermilvar_8f32_movddup(<8 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_8f32_movddup:
+; ALL: # BB#0:
+; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT: retq
+ %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>)
+ ret <8 x float> %1
+}
+define <8 x float> @combine_vpermilvar_8f32_movddup_load(<8 x float> *%a0) {
+; ALL-LABEL: combine_vpermilvar_8f32_movddup_load:
+; ALL: # BB#0:
+; ALL-NEXT: vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
+; ALL-NEXT: retq
+ %1 = load <8 x float>, <8 x float> *%a0
+ %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>)
+ ret <8 x float> %2
+}
+
+define <8 x float> @combine_vpermilvar_8f32_movshdup(<8 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_8f32_movshdup:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; ALL-NEXT: retq
+ %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 5, i32 7, i32 7>)
+ ret <8 x float> %1
+}
+
+define <8 x float> @combine_vpermilvar_8f32_movsldup(<8 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_8f32_movsldup:
+; ALL: # BB#0:
+; ALL-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; ALL-NEXT: retq
+ %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>)
+ ret <8 x float> %1
+}
+
+define <2 x double> @combine_vpermilvar_2f64_identity(<2 x double> %a0) {
+; ALL-LABEL: combine_vpermilvar_2f64_identity:
+; ALL: # BB#0:
+; ALL-NEXT: retq
+ %1 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> <i64 2, i64 0>)
+ %2 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %1, <2 x i64> <i64 2, i64 0>)
+ ret <2 x double> %2
+}
+
+define <2 x double> @combine_vpermilvar_2f64_movddup(<2 x double> %a0) {
+; ALL-LABEL: combine_vpermilvar_2f64_movddup:
+; ALL: # BB#0:
+; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; ALL-NEXT: retq
+ %1 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> <i64 0, i64 0>)
+ ret <2 x double> %1
+}
+
+define <4 x double> @combine_vpermilvar_4f64_identity(<4 x double> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f64_identity:
+; ALL: # BB#0:
+; ALL-NEXT: retq
+ %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
+ %2 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %1, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
+ ret <4 x double> %2
+}
+
+define <4 x double> @combine_vpermilvar_4f64_movddup(<4 x double> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f64_movddup:
+; ALL: # BB#0:
+; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT: retq
+ %1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 0, i64 0, i64 4, i64 4>)
+ ret <4 x double> %1
+}
+
+define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f32_4stage:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
+; ALL-NEXT: retq
+ %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
+ %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>)
+ %3 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>)
+ %4 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %3, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
+ ret <4 x float> %4
+}
+
+define <8 x float> @combine_vpermilvar_8f32_4stage(<8 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_8f32_4stage:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
+; ALL-NEXT: retq
+ %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
+ %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>)
+ %3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %2, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 0, i32 2, i32 1, i32 3>)
+ %4 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %3, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
+ ret <8 x float> %4
+}
+
+define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
+; ALL-LABEL: combine_vpermilvar_4f32_as_insertps:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[1],zero,xmm0[2],zero
+; ALL-NEXT: retq
+ %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
+ %2 = shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 1, i32 4>
+ ret <4 x float> %2
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
new file mode 100644
index 000000000000..a10ba6ccc41e
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -0,0 +1,324 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s
+
+declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
+declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
+declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)
+
+define <32 x i8> @combine_pshufb_pslldq(<32 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_pslldq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
+ %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ ret <32 x i8> %2
+}
+
+define <32 x i8> @combine_pshufb_psrldq(<32 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_psrldq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
+ %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ ret <32 x i8> %2
+}
+
+define <32 x i8> @combine_pshufb_vpermd(<8 x i32> %a) {
+; CHECK-LABEL: combine_pshufb_vpermd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
+; CHECK-NEXT: retq
+ %tmp0 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
+ %tmp1 = bitcast <8 x i32> %tmp0 to <32 x i8>
+ %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
+ ret <32 x i8> %tmp2
+}
+
+define <32 x i8> @combine_pshufb_vpermps(<8 x float> %a) {
+; CHECK-LABEL: combine_pshufb_vpermps:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
+; CHECK-NEXT: retq
+ %tmp0 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
+ %tmp1 = bitcast <8 x float> %tmp0 to <32 x i8>
+ %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
+ ret <32 x i8> %tmp2
+}
+
+define <4 x i64> @combine_permq_pshufb_as_vperm2i128(<4 x i64> %a0) {
+; CHECK-LABEL: combine_permq_pshufb_as_vperm2i128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
+; CHECK-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ %2 = bitcast <4 x i64> %1 to <32 x i8>
+ %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
+ %4 = bitcast <32 x i8> %3 to <4 x i64>
+ %5 = add <4 x i64> %4, <i64 1, i64 1, i64 3, i64 3>
+ ret <4 x i64> %5
+}
+
+define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
+; CHECK-LABEL: combine_permq_pshufb_as_vpblendd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %2 = bitcast <4 x i64> %1 to <32 x i8>
+ %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
+ ret <32 x i8> %3
+}
+
+define <16 x i8> @combine_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastb128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> zeroinitializer)
+ ret <16 x i8> %1
+}
+
+define <32 x i8> @combine_pshufb_as_vpbroadcastb256(<2 x i64> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastb256:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+ %2 = bitcast <4 x i64> %1 to <32 x i8>
+ %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> zeroinitializer)
+ %4 = bitcast <32 x i8> %3 to <8 x i32>
+ %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
+ %6 = bitcast <8 x i32> %5 to <32 x i8>
+ ret <32 x i8> %6
+}
+
+define <16 x i8> @combine_pshufb_as_vpbroadcastw128(<16 x i8> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastw128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
+ ret <16 x i8> %1
+}
+
+define <32 x i8> @combine_pshufb_as_vpbroadcastw256(<2 x i64> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastw256:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+ %2 = bitcast <4 x i64> %1 to <32 x i8>
+ %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
+ %4 = bitcast <32 x i8> %3 to <8 x i32>
+ %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
+ %6 = bitcast <8 x i32> %5 to <32 x i8>
+ ret <32 x i8> %6
+}
+
+define <16 x i8> @combine_pshufb_as_vpbroadcastd128(<16 x i8> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastd128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0
+; CHECK-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
+ %2 = add <16 x i8> %1, <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>
+ ret <16 x i8> %2
+}
+
+define <8 x i32> @combine_permd_as_vpbroadcastd256(<4 x i32> %a) {
+; CHECK-LABEL: combine_permd_as_vpbroadcastd256:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0
+; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> zeroinitializer)
+ %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i32> %3
+}
+
+define <16 x i8> @combine_pshufb_as_vpbroadcastq128(<16 x i8> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastq128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
+ ret <16 x i8> %1
+}
+
+define <8 x i32> @combine_permd_as_vpbroadcastq256(<4 x i32> %a) {
+; CHECK-LABEL: combine_permd_as_vpbroadcastq256:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0
+; CHECK-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
+ %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i32> %3
+}
+
+define <4 x float> @combine_pshufb_as_vpbroadcastss128(<4 x float> %a) {
+; CHECK-LABEL: combine_pshufb_as_vpbroadcastss128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = bitcast <4 x float> %a to <16 x i8>
+ %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
+ %3 = bitcast <16 x i8> %2 to <4 x float>
+ ret <4 x float> %3
+}
+
+define <8 x float> @combine_permd_as_vpbroadcastss256(<4 x float> %a) {
+; CHECK-LABEL: combine_permd_as_vpbroadcastss256:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
+ ret <8 x float> %2
+}
+
+define <4 x double> @combine_permd_as_vpbroadcastsd256(<2 x double> %a) {
+; CHECK-LABEL: combine_permd_as_vpbroadcastsd256:
+; CHECK: # BB#0:
+; CHECK-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+ %2 = bitcast <4 x double> %1 to <8 x float>
+ %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
+ %4 = bitcast <8 x float> %3 to <4 x double>
+ ret <4 x double> %4
+}
+
+define <16 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
+; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
+ %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> zeroinitializer)
+ ret <16 x i8> %2
+}
+
+define <32 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb256(<32 x i8> %a) {
+; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
+ %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> zeroinitializer)
+ ret <32 x i8> %2
+}
+
+define <4 x float> @combine_vpbroadcast_pshufb_as_vpbroadcastss128(<4 x float> %a) {
+; CHECK-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastss128:
+; CHECK: # BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
+ %2 = bitcast <4 x float> %1 to <16 x i8>
+ %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
+ %4 = bitcast <16 x i8> %3 to <4 x float>
+ ret <4 x float> %4
+}
+
+define <8 x float> @combine_vpbroadcast_permd_as_vpbroadcastss256(<4 x float> %a) {
+; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> zeroinitializer
+ %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
+ ret <8 x float> %2
+}
+
+define <4 x double> @combine_vpbroadcast_permd_as_vpbroadcastsd256(<2 x double> %a) {
+; CHECK-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> zeroinitializer
+ %2 = bitcast <4 x double> %1 to <8 x float>
+ %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
+ %4 = bitcast <8 x float> %3 to <4 x double>
+ ret <4 x double> %4
+}
+
+define <8 x i32> @combine_permd_as_permq(<8 x i32> %a) {
+; CHECK-LABEL: combine_permd_as_permq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,1]
+; CHECK-NEXT: retq
+ %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 2, i32 3>)
+ ret <8 x i32> %1
+}
+
+define <8 x float> @combine_permps_as_permpd(<8 x float> %a) {
+; CHECK-LABEL: combine_permps_as_permpd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1]
+; CHECK-NEXT: retq
+ %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>)
+ ret <8 x float> %1
+}
+
+define <32 x i8> @combine_pshufb_as_pslldq(<32 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_as_pslldq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21]
+; CHECK-NEXT: retq
+ %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
+ ret <32 x i8> %res0
+}
+
+define <32 x i8> @combine_pshufb_as_psrldq(<32 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_as_psrldq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retq
+ %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
+ ret <32 x i8> %res0
+}
+
+define <32 x i8> @combine_pshufb_as_pshuflw(<32 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_as_pshuflw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
+; CHECK-NEXT: retq
+ %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+ ret <32 x i8> %res0
+}
+
+define <32 x i8> @combine_pshufb_as_pshufhw(<32 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_as_pshufhw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14]
+; CHECK-NEXT: retq
+ %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
+ ret <32 x i8> %res0
+}
+
+define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_not_as_pshufw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
+; CHECK-NEXT: retq
+ %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+ %res1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %res0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
+ ret <32 x i8> %res1
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
new file mode 100644
index 000000000000..baf1054170ba
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -0,0 +1,515 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw | FileCheck %s
+
+declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
+declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
+
+declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8)
+declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
+declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
+
+declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+define <8 x double> @combine_permvar_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
+; CHECK-LABEL: combine_permvar_8f64_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+ %res0 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x1, i8 -1)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, i8 -1)
+ ret <8 x double> %res1
+}
+define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
+; CHECK-LABEL: combine_permvar_8f64_identity_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
+; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
+; CHECK-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res0 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x1, i8 %m)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, i8 %m)
+ ret <8 x double> %res1
+}
+
+define <8 x i64> @combine_permvar_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
+; CHECK-LABEL: combine_permvar_8i64_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+ %res0 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x1, i8 -1)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, i8 -1)
+ ret <8 x i64> %res1
+}
+define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
+; CHECK-LABEL: combine_permvar_8i64_identity_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
+; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
+; CHECK-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res0 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x1, i8 %m)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, i8 %m)
+ ret <8 x i64> %res1
+}
+
+define <8 x double> @combine_vpermt2var_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
+; CHECK-LABEL: combine_vpermt2var_8f64_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+ %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 -1)
+ %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 -1)
+ ret <8 x double> %res1
+}
+define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
+; CHECK-LABEL: combine_vpermt2var_8f64_identity_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
+; CHECK-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
+; CHECK-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 %m)
+ %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 %m)
+ ret <8 x double> %res1
+}
+
+define <8 x double> @combine_vpermt2var_8f64_movddup(<8 x double> %x0, <8 x double> %x1) {
+; CHECK-LABEL: combine_vpermt2var_8f64_movddup:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 undef, i64 undef>, <8 x double> %x0, <8 x double> %x1, i8 -1)
+ ret <8 x double> %res0
+}
+define <8 x double> @combine_vpermt2var_8f64_movddup_load(<8 x double> *%p0, <8 x double> %x1) {
+; CHECK-LABEL: combine_vpermt2var_8f64_movddup_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %x0 = load <8 x double>, <8 x double> *%p0
+ %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 -1)
+ ret <8 x double> %res0
+}
+define <8 x double> @combine_vpermt2var_8f64_movddup_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
+; CHECK-LABEL: combine_vpermt2var_8f64_movddup_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 %m)
+ ret <8 x double> %res0
+}
+
+define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
+; CHECK-LABEL: combine_vpermt2var_8i64_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+ %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 -1)
+ %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 -1)
+ ret <8 x i64> %res1
+}
+define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
+; CHECK-LABEL: combine_vpermt2var_8i64_identity_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
+; CHECK-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,5,12,3,10,1,8]
+; CHECK-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 %m)
+ %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 %m)
+ ret <8 x i64> %res1
+}
+
+define <16 x float> @combine_vpermt2var_16f32_identity(<16 x float> %x0, <16 x float> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16f32_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 -1)
+ %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 -1)
+ ret <16 x float> %res1
+}
+define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
+; CHECK-LABEL: combine_vpermt2var_16f32_identity_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; CHECK-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; CHECK-NEXT: vpermt2ps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+ %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 %m)
+ ret <16 x float> %res1
+}
+
+define <16 x float> @combine_vpermt2var_16f32_vmovddup(<16 x float> %x0, <16 x float> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; CHECK-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vmovddup_load(<16 x float> *%p0, <16 x float> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %x0 = load <16 x float>, <16 x float> *%p0
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; CHECK-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %x0 = load <16 x float>, <16 x float> *%p0
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+ ret <16 x float> %res0
+}
+
+define <16 x float> @combine_vpermt2var_16f32_vmovshdup(<16 x float> %x0, <16 x float> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vmovshdup_load(<16 x float> *%p0, <16 x float> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %x0 = load <16 x float>, <16 x float> *%p0
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vmovshdup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+ ret <16 x float> %res0
+}
+
+define <16 x float> @combine_vpermt2var_16f32_vmovsldup(<16 x float> %x0, <16 x float> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vmovsldup_load(<16 x float> *%p0, <16 x float> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %x0 = load <16 x float>, <16 x float> *%p0
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %x0 = load <16 x float>, <16 x float> *%p0
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+ ret <16 x float> %res0
+}
+
+define <16 x float> @combine_vpermt2var_16f32_vpermilps(<16 x float> %x0, <16 x float> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vpermilps_load(<16 x float> *%p0, <16 x float> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT: retq
+ %x0 = load <16 x float>, <16 x float> *%p0
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+ ret <16 x float> %res0
+}
+define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
+; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT: retq
+ %x0 = load <16 x float>, <16 x float> *%p0
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+ ret <16 x float> %res0
+}
+
+define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) {
+; CHECK-LABEL: combine_vpermt2var_16i32_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+ %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
+ %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
+ ret <16 x i32> %res1
+}
+define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x i32> %x1, i16 %m) {
+; CHECK-LABEL: combine_vpermt2var_16i32_identity_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; CHECK-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm1 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; CHECK-NEXT: vpermt2d %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> %x0, <16 x i32> %x1, i16 %m)
+ %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 %m)
+ ret <16 x i32> %res1
+}
+
+define <32 x i16> @combine_vpermt2var_32i16_identity(<32 x i16> %x0, <32 x i16> %x1) {
+; CHECK-LABEL: combine_vpermt2var_32i16_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+ %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x0, <32 x i16> %x1, i32 -1)
+ %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, <32 x i16> %res0, i32 -1)
+ ret <32 x i16> %res1
+}
+define <32 x i16> @combine_vpermt2var_32i16_identity_mask(<32 x i16> %x0, <32 x i16> %x1, i32 %m) {
+; CHECK-LABEL: combine_vpermt2var_32i16_identity_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu16 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; CHECK-NEXT: vpermt2w %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqu16 {{.*#+}} zmm1 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
+; CHECK-NEXT: vpermt2w %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x0, <32 x i16> %x1, i32 %m)
+ %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, <32 x i16> %res0, i32 %m)
+ ret <32 x i16> %res1
+}
+
+define <64 x i8> @combine_pshufb_identity(<64 x i8> %x0) {
+; CHECK-LABEL: combine_pshufb_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: retq
+ %select = bitcast <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> to <64 x i8>
+ %mask = bitcast <16 x i32> <i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 undef, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051> to <64 x i8>
+ %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %mask, <64 x i8> %select, i64 -1)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %res0, <64 x i8> %mask, <64 x i8> %select, i64 -1)
+ ret <64 x i8> %res1
+}
+define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) {
+; CHECK-LABEL: combine_pshufb_identity_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: vmovdqu8 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; CHECK-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3
+; CHECK-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vpshufb %zmm2, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %select = bitcast <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> to <64 x i8>
+ %mask = bitcast <16 x i32> <i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051> to <64 x i8>
+ %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %mask, <64 x i8> %select, i64 %m)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %res0, <64 x i8> %mask, <64 x i8> %select, i64 %m)
+ ret <64 x i8> %res1
+}
+
+define <32 x i16> @combine_permvar_as_vpbroadcastw512(<32 x i16> %x0) {
+; CHECK-LABEL: combine_permvar_as_vpbroadcastw512:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0
+; CHECK-NEXT: retq
+ %1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> zeroinitializer, <32 x i16> undef, i32 -1)
+ ret <32 x i16> %1
+}
+
+define <16 x i32> @combine_permvar_as_vpbroadcastd512(<16 x i32> %x0) {
+; CHECK-LABEL: combine_permvar_as_vpbroadcastd512:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0
+; CHECK-NEXT: retq
+ %1 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> zeroinitializer, <16 x i32> undef, i16 -1)
+ ret <16 x i32> %1
+}
+
+define <8 x i64> @combine_permvar_as_vpbroadcastq512(<8 x i64> %x0) {
+; CHECK-LABEL: combine_permvar_as_vpbroadcastq512:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0
+; CHECK-NEXT: retq
+ %1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> zeroinitializer, <8 x i64> undef, i8 -1)
+ ret <8 x i64> %1
+}
+
+define <8 x i64> @combine_permvar_8i64_as_permq(<8 x i64> %x0, <8 x i64> %x1) {
+; CHECK-LABEL: combine_permvar_8i64_as_permq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
+; CHECK-NEXT: retq
+ %1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>, <8 x i64> %x1, i8 -1)
+ ret <8 x i64> %1
+}
+define <8 x i64> @combine_permvar_8i64_as_permq_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
+; CHECK-LABEL: combine_permvar_8i64_as_permq_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>, <8 x i64> %x1, i8 %m)
+ ret <8 x i64> %1
+}
+
+define <8 x double> @combine_permvar_8f64_as_permpd(<8 x double> %x0, <8 x double> %x1) {
+; CHECK-LABEL: combine_permvar_8f64_as_permpd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
+; CHECK-NEXT: retq
+ %1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>, <8 x double> %x1, i8 -1)
+ ret <8 x double> %1
+}
+define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
+; CHECK-LABEL: combine_permvar_8f64_as_permpd_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>, <8 x double> %x1, i8 %m)
+ ret <8 x double> %1
+}
+
+define <16 x float> @combine_vpermilvar_16f32_230146759A8BCFDE(<16 x float> %x0) {
+; CHECK-LABEL: combine_vpermilvar_16f32_230146759A8BCFDE:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,4,6,7,5,9,10,8,11,12,15,13,14]
+; CHECK-NEXT: retq
+ %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1, i32 1, i32 0, i32 3, i32 2>, <16 x float> undef, i16 -1)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %res0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 1, i32 0, i32 2, i32 3, i32 0, i32 2, i32 1, i32 1, i32 2, i32 0, i32 3>, <16 x float> undef, i16 -1)
+ ret <16 x float> %res1
+}
+
+define <64 x i8> @combine_pshufb_as_pslldq(<64 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_as_pslldq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
+; CHECK-NEXT: retq
+ %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>, <64 x i8> undef, i64 -1)
+ ret <64 x i8> %res0
+}
+define <64 x i8> @combine_pshufb_as_pslldq_mask(<64 x i8> %a0, i64 %m) {
+; CHECK-LABEL: combine_pshufb_as_pslldq_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
+; CHECK-NEXT: retq
+ %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>, <64 x i8> zeroinitializer, i64 %m)
+ ret <64 x i8> %res0
+}
+
+define <64 x i8> @combine_pshufb_as_psrldq(<64 x i8> %a0) {
+; CHECK-LABEL: combine_pshufb_as_psrldq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retq
+ %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>, <64 x i8> undef, i64 -1)
+ ret <64 x i8> %res0
+}
+define <64 x i8> @combine_pshufb_as_psrldq_mask(<64 x i8> %a0, i64 %m) {
+; CHECK-LABEL: combine_pshufb_as_psrldq_mask:
+; CHECK: # BB#0:
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retq
+ %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>, <64 x i8> zeroinitializer, i64 %m)
+ ret <64 x i8> %res0
+}
+
+define <32 x i16> @combine_permvar_as_pshuflw(<32 x i16> %a0) {
+; CHECK-LABEL: combine_permvar_as_pshuflw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
+; CHECK-NEXT: retq
+ %res0 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 1, i16 0, i16 3, i16 2, i16 4, i16 5, i16 6, i16 7, i16 9, i16 8, i16 11, i16 10, i16 12, i16 13, i16 14, i16 15, i16 17, i16 16, i16 19, i16 18, i16 20, i16 21, i16 22, i16 23, i16 25, i16 24, i16 27, i16 26, i16 28, i16 29, i16 30, i16 31>, <32 x i16> undef, i32 -1)
+ ret <32 x i16> %res0
+}
+
+define <32 x i16> @combine_pshufb_as_pshufhw(<32 x i16> %a0) {
+; CHECK-LABEL: combine_pshufb_as_pshufhw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
+; CHECK-NEXT: retq
+ %res0 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 5, i16 4, i16 7, i16 6, i16 8, i16 9, i16 10, i16 11, i16 13, i16 12, i16 15, i16 14, i16 16, i16 17, i16 18, i16 19, i16 21, i16 20, i16 23, i16 22, i16 24, i16 25, i16 26, i16 27, i16 29, i16 28, i16 31, i16 30>, <32 x i16> undef, i32 -1)
+ ret <32 x i16> %res0
+}
+
+define <32 x i16> @combine_pshufb_as_pshufw(<32 x i16> %a0) {
+; CHECK-LABEL: combine_pshufb_as_pshufw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
+; CHECK-NEXT: retq
+ %res0 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 1, i16 0, i16 3, i16 2, i16 4, i16 5, i16 6, i16 7, i16 9, i16 8, i16 11, i16 10, i16 12, i16 13, i16 14, i16 15, i16 17, i16 16, i16 19, i16 18, i16 20, i16 21, i16 22, i16 23, i16 25, i16 24, i16 27, i16 26, i16 28, i16 29, i16 30, i16 31>, <32 x i16> undef, i32 -1)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %res0, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 5, i16 4, i16 7, i16 6, i16 8, i16 9, i16 10, i16 11, i16 13, i16 12, i16 15, i16 14, i16 16, i16 17, i16 18, i16 19, i16 21, i16 20, i16 23, i16 22, i16 24, i16 25, i16 26, i16 27, i16 29, i16 28, i16 31, i16 30>, <32 x i16> undef, i32 -1)
+ ret <32 x i16> %res1
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
new file mode 100644
index 000000000000..85e1071a35aa
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -0,0 +1,267 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512F
+;
+; Combine tests involving SSE3/SSSE3 target shuffles (MOVDDUP, MOVSHDUP, MOVSLDUP, PSHUFB)
+
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @combine_vpshufb_zero(<16 x i8> %a0) {
+; SSE-LABEL: combine_vpshufb_zero:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_vpshufb_zero:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+ %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+ %res2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res1, <16 x i8> <i8 0, i8 1, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
+ ret <16 x i8> %res2
+}
+
+define <16 x i8> @combine_vpshufb_movq(<16 x i8> %a0) {
+; SSE-LABEL: combine_vpshufb_movq:
+; SSE: # BB#0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_vpshufb_movq:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 128, i8 1, i8 128, i8 2, i8 128, i8 3, i8 128, i8 4, i8 128, i8 5, i8 128, i8 6, i8 128, i8 7, i8 128>)
+ %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 1, i8 3, i8 5, i8 7, i8 9, i8 11, i8 13, i8 15>)
+ ret <16 x i8> %res1
+}
+
+define <4 x float> @combine_pshufb_movddup(<4 x float> %a0) {
+; SSE-LABEL: combine_pshufb_movddup:
+; SSE: # BB#0:
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,7,7,7,7,5,5,5,5,7,7,7,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_movddup:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,7,7,7,7,5,5,5,5,7,7,7,7]
+; AVX-NEXT: retq
+ %1 = bitcast <4 x float> %a0 to <16 x i8>
+ %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
+ %3 = bitcast <16 x i8> %2 to <4 x float>
+ %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x float> %4
+}
+
+define <4 x float> @combine_pshufb_movshdup(<4 x float> %a0) {
+; SSE-LABEL: combine_pshufb_movshdup:
+; SSE: # BB#0:
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,7,7,3,3,3,3,3,3,3,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_movshdup:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,7,7,3,3,3,3,3,3,3,3]
+; AVX-NEXT: retq
+ %1 = bitcast <4 x float> %a0 to <16 x i8>
+ %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
+ %3 = bitcast <16 x i8> %2 to <4 x float>
+ %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ ret <4 x float> %4
+}
+
+define <4 x float> @combine_pshufb_movsldup(<4 x float> %a0) {
+; SSE-LABEL: combine_pshufb_movsldup:
+; SSE: # BB#0:
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_movsldup:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1]
+; AVX-NEXT: retq
+ %1 = bitcast <4 x float> %a0 to <16 x i8>
+ %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
+ %3 = bitcast <16 x i8> %2 to <4 x float>
+ %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ ret <4 x float> %4
+}
+
+define <16 x i8> @combine_pshufb_palignr(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: combine_pshufb_palignr:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_palignr:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @combine_pshufb_pslldq(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_pslldq:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_pslldq:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
+ %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @combine_pshufb_psrldq(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_psrldq:
+; SSE: # BB#0:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_psrldq:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
+ %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @combine_pshufb_as_pslldq(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_as_pslldq:
+; SSE: # BB#0:
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_as_pslldq:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
+ ret <16 x i8> %res0
+}
+
+define <16 x i8> @combine_pshufb_as_psrldq(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_as_psrldq:
+; SSE: # BB#0:
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_as_psrldq:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
+ ret <16 x i8> %res0
+}
+
+define <16 x i8> @combine_pshufb_as_pshuflw(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_as_pshuflw:
+; SSE: # BB#0:
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_as_pshuflw:
+; AVX: # BB#0:
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; AVX-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+ ret <16 x i8> %res0
+}
+
+define <16 x i8> @combine_pshufb_as_pshufhw(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_as_pshufhw:
+; SSE: # BB#0:
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_as_pshufhw:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; AVX-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
+ ret <16 x i8> %res0
+}
+
+define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_not_as_pshufw:
+; SSE: # BB#0:
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_not_as_pshufw:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; AVX-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+ %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
+ ret <16 x i8> %res1
+}
+
+define <16 x i8> @combine_pshufb_as_unary_unpcklbw(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_as_unary_unpcklbw:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_as_unary_unpcklbw:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 undef, i8 undef, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 5, i8 5, i8 6, i8 6, i8 7, i8 7>)
+ ret <16 x i8> %1
+}
+
+define <16 x i8> @combine_pshufb_as_unary_unpckhwd(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_as_unary_unpckhwd:
+; SSE: # BB#0:
+; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_pshufb_as_unary_unpckhwd:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 8, i8 9, i8 10, i8 11, i8 10, i8 11, i8 12, i8 13, i8 12, i8 13, i8 14, i8 15, i8 undef, i8 undef>)
+ ret <16 x i8> %1
+}
+
+define <16 x i8> @combine_unpckl_arg0_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: combine_unpckl_arg0_pshufb:
+; SSE: # BB#0:
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_unpckl_arg0_pshufb:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
+; AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1>)
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @combine_unpckl_arg1_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: combine_unpckl_arg1_pshufb:
+; SSE: # BB#0:
+; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_unpckl_arg1_pshufb:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1>)
+ ret <16 x i8> %2
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
new file mode 100644
index 000000000000..76226065fd7c
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -0,0 +1,133 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+xop | FileCheck %s
+
+declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone
+declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone
+
+declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone
+declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone
+
+declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x double> @combine_vpermil2pd_identity(<2 x double> %a0, <2 x double> %a1) {
+; CHECK-LABEL: combine_vpermil2pd_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res0 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a1, <2 x double> %a0, <2 x i64> <i64 2, i64 0>, i8 0)
+ %res1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %res0, <2 x double> undef, <2 x i64> <i64 2, i64 0>, i8 0)
+ ret <2 x double> %res1
+}
+
+define <4 x double> @combine_vpermil2pd256_identity(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: combine_vpermil2pd256_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res0 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a1, <4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>, i8 0)
+ %res1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %res0, <4 x double> undef, <4 x i64> <i64 2, i64 0, i64 2, i64 0>, i8 0)
+ ret <4 x double> %res1
+}
+
+define <4 x float> @combine_vpermil2ps_identity(<4 x float> %a0, <4 x float> %a1) {
+; CHECK-LABEL: combine_vpermil2ps_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a1, <4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>, i8 0)
+ %res1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %res0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>, i8 0)
+ ret <4 x float> %res1
+}
+
+define <8 x float> @combine_vpermil2ps256_identity(<8 x float> %a0, <8 x float> %a1) {
+; CHECK-LABEL: combine_vpermil2ps256_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a1, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2>, i8 0)
+ %res1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %res0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2>, i8 0)
+ ret <8 x float> %res1
+}
+
+define <8 x float> @combine_vpermil2ps256_zero(<8 x float> %a0, <8 x float> %a1) {
+; CHECK-LABEL: combine_vpermil2ps256_zero:
+; CHECK: # BB#0:
+; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a1, <8 x float> %a0, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>, i8 2)
+ ret <8 x float> %res0
+}
+
+define <4 x float> @combine_vpermil2ps_blend_with_zero(<4 x float> %a0, <4 x float> %a1) {
+; CHECK-LABEL: combine_vpermil2ps_blend_with_zero:
+; CHECK: # BB#0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT: retq
+ %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 8, i32 1, i32 2, i32 3>, i8 2)
+ ret <4 x float> %res0
+}
+
+define <16 x i8> @combine_vpperm_identity(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: combine_vpperm_identity:
+; CHECK: # BB#0:
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16>)
+ %res1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res0, <16 x i8> undef, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+ ret <16 x i8> %res1
+}
+
+define <16 x i8> @combine_vpperm_zero(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: combine_vpperm_zero:
+; CHECK: # BB#0:
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+ %res1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res0, <16 x i8> undef, <16 x i8> <i8 0, i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
+ %res2 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res1, <16 x i8> undef, <16 x i8> <i8 0, i8 1, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
+ ret <16 x i8> %res2
+}
+
+define <16 x i8> @combine_vpperm_identity_bitcast(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: combine_vpperm_identity_bitcast:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %mask = bitcast <2 x i64> <i64 1084818905618843912, i64 506097522914230528> to <16 x i8>
+ %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %mask)
+ %res1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %res0, <16 x i8> undef, <16 x i8> %mask)
+ %res2 = bitcast <16 x i8> %res1 to <2 x i64>
+ %res3 = add <2 x i64> %res2, <i64 1084818905618843912, i64 506097522914230528>
+ %res4 = bitcast <2 x i64> %res3 to <16 x i8>
+ ret <16 x i8> %res4
+}
+
+define <16 x i8> @combine_vpperm_as_blend_with_zero(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: combine_vpperm_as_blend_with_zero:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4,5,6,7]
+; CHECK-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 0, i8 1, i8 128, i8 129, i8 4, i8 5, i8 6, i8 7, i8 130, i8 131, i8 132, i8 133, i8 134, i8 135, i8 136, i8 137>)
+ ret <16 x i8> %res0
+}
+
+define <16 x i8> @combine_vpperm_as_unary_unpckhwd(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: combine_vpperm_as_unary_unpckhwd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> <i8 8, i8 undef, i8 9, i8 25, i8 10, i8 26, i8 11, i8 27, i8 12, i8 28, i8 13, i8 29, i8 14, i8 30, i8 15, i8 31>)
+ ret <16 x i8> %res0
+}
+
+define <16 x i8> @combine_vpperm_as_unpckhwd(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: combine_vpperm_as_unpckhwd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; CHECK-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 8, i8 24, i8 9, i8 25, i8 10, i8 26, i8 11, i8 27, i8 12, i8 28, i8 13, i8 29, i8 14, i8 30, i8 15, i8 31>)
+ ret <16 x i8> %res0
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index 75ce9753525b..266a3658eda9 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -96,10 +96,15 @@ define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
-; AVX-LABEL: combine_pshufd6:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_pshufd6:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_pshufd6:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
+; AVX2-NEXT: retq
entry:
%b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
%c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
@@ -1783,13 +1788,13 @@ define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
; SSE-LABEL: combine_test22:
; SSE: # BB#0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT: movhpd (%rsi), %xmm0
+; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test22:
; AVX: # BB#0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT: retq
; Current AVX2 lowering of this is still awful, not adding a test case.
%1 = load <2 x float>, <2 x float>* %a, align 8
@@ -1798,6 +1803,29 @@ define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
ret <8 x float> %3
}
+; PR22359
+define void @combine_test23(<8 x float> %v, <2 x float>* %ptr) {
+; SSE-LABEL: combine_test23:
+; SSE: # BB#0:
+; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_test23:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX-NEXT: vmovups %xmm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %idx2 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 1
+ %shuffle0 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 0, i32 1>
+ %shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 2, i32 3>
+ store <2 x float> %shuffle0, <2 x float>* %ptr, align 8
+ store <2 x float> %shuffle1, <2 x float>* %idx2, align 8
+ ret void
+}
+
; Check some negative cases.
; FIXME: Do any of these really make sense? Are they redundant with the above tests?
@@ -2412,7 +2440,7 @@ define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
;
; AVX-LABEL: combine_undef_input_test9:
; AVX: # BB#0:
-; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
%2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
@@ -2603,7 +2631,7 @@ define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
;
; AVX-LABEL: combine_undef_input_test19:
; AVX: # BB#0:
-; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
%2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -2636,15 +2664,16 @@ define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_unneeded_subvector1:
; AVX2: # BB#0:
; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: retq
%b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
%c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
@@ -2795,6 +2824,50 @@ define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %d
}
+; FIXME: Failed to recognise that the VMOVSD has already zeroed the upper element
+define void @combine_scalar_load_with_blend_with_zero(double* %a0, <4 x float>* %a1) {
+; SSE2-LABEL: combine_scalar_load_with_blend_with_zero:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT: movaps %xmm0, (%rsi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_scalar_load_with_blend_with_zero:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT: movaps %xmm0, (%rsi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_scalar_load_with_blend_with_zero:
+; SSE41: # BB#0:
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT: xorpd %xmm1, %xmm1
+; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE41-NEXT: movapd %xmm1, (%rsi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_scalar_load_with_blend_with_zero:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT: vmovapd %xmm0, (%rsi)
+; AVX-NEXT: retq
+ %1 = load double, double* %a0, align 8
+ %2 = insertelement <2 x double> undef, double %1, i32 0
+ %3 = insertelement <2 x double> %2, double 0.000000e+00, i32 1
+ %4 = bitcast <2 x double> %3 to <4 x float>
+ %5 = shufflevector <4 x float> %4, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+ store <4 x float> %5, <4 x float>* %a1, align 16
+ ret void
+}
+
define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: PR22377:
; SSE: # BB#0: # %entry
@@ -2898,8 +2971,8 @@ define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
; AVX2-LABEL: PR22412:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,7,6,5,4,3,2]
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
; AVX2-NEXT: retq
entry:
%s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
diff --git a/test/CodeGen/X86/vector-shuffle-sse1.ll b/test/CodeGen/X86/vector-shuffle-sse1.ll
index 548de4ce6ea3..fc4652eca55d 100644
--- a/test/CodeGen/X86/vector-shuffle-sse1.ll
+++ b/test/CodeGen/X86/vector-shuffle-sse1.ll
@@ -91,6 +91,22 @@ define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
ret <4 x float> %shuffle
}
+define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_0145:
+; SSE1: # BB#0:
+; SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_6723:
+; SSE1: # BB#0:
+; SSE1-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+ ret <4 x float> %shuffle
+}
define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
; SSE1-LABEL: shuffle_v4f32_4zzz:
@@ -194,7 +210,7 @@ define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE1-NEXT: xorps %xmm2, %xmm2
-; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE1-NEXT: movaps %xmm1, %xmm0
; SSE1-NEXT: retq
@@ -215,8 +231,8 @@ define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE1-NEXT: xorps %xmm2, %xmm2
-; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE1-NEXT: retq
%a = load <2 x float>, <2 x float>* %ptr
%v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -234,3 +250,21 @@ define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
%shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x float> %shuffle
}
+define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, <4 x float>* %pb) {
+; SSE1-LABEL: shuffle_mem_v4f32_0145:
+; SSE1: # BB#0:
+; SSE1-NEXT: movhps (%rdi), %xmm0
+; SSE1-NEXT: retq
+ %b = load <4 x float>, <4 x float>* %pb, align 16
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_mem_v4f32_6723(<4 x float> %a, <4 x float>* %pb) {
+; SSE1-LABEL: shuffle_mem_v4f32_6723:
+; SSE1: # BB#0:
+; SSE1-NEXT: movlps 8(%rdi), %xmm0
+; SSE1-NEXT: retq
+ %b = load <4 x float>, <4 x float>* %pb, align 16
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+ ret <4 x float> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-sse41.ll b/test/CodeGen/X86/vector-shuffle-sse41.ll
new file mode 100644
index 000000000000..be9a4b950778
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-sse41.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX
+
+define <8 x i16> @blend_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
+; SSE41-LABEL: blend_packusdw:
+; SSE41: # BB#0:
+; SSE41-NEXT: packusdw %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: blend_packusdw:
+; AVX: # BB#0:
+; AVX-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %p0 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
+ %p1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a2, <4 x i32> %a3)
+ %s0 = shufflevector <8 x i16> %p0, <8 x i16> %p1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x i16> %s0
+}
+
+define <16 x i8> @blend_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
+; SSE41-LABEL: blend_packuswb:
+; SSE41: # BB#0:
+; SSE41-NEXT: packuswb %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: blend_packuswb:
+; AVX: # BB#0:
+; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %p0 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
+ %p1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3)
+ %s0 = shufflevector <16 x i8> %p0, <16 x i8> %p1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ ret <16 x i8> %s0
+}
+
+define <8 x i16> @blend_packusdw_packuswb(<4 x i32> %a0, <4 x i32> %a1, <8 x i16> %a2, <8 x i16> %a3) {
+; SSE41-LABEL: blend_packusdw_packuswb:
+; SSE41: # BB#0:
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: packuswb %xmm3, %xmm2
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: blend_packusdw_packuswb:
+; AVX: # BB#0:
+; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %p0 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
+ %p1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3)
+ %b1 = bitcast <16 x i8> %p1 to <8 x i16>
+ %s0 = shufflevector <8 x i16> %p0, <8 x i16> %b1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x i16> %s0
+}
+
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)
diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll
index a387f894a067..1c128645ad14 100644
--- a/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -13,11 +13,11 @@ define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
; VL_BW_DQ-LABEL: shuf2i1_1_0:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT: retq
%b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
@@ -35,14 +35,14 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; VL_BW_DQ-LABEL: shuf2i1_1_2:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT: movb $1, %al
; VL_BW_DQ-NEXT: kmovb %eax, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm1
-; VL_BW_DQ-NEXT: vpalignr $8, %xmm0, %xmm1, %xmm0
+; VL_BW_DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT: retq
%b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
@@ -59,11 +59,11 @@ define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
+; VL_BW_DQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
+; VL_BW_DQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT: retq
%b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -74,13 +74,13 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: retq
;
@@ -91,7 +91,7 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT: retq
%a2 = icmp eq <8 x i64> %a, %a1
@@ -105,14 +105,14 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1
; AVX512F: # BB#0:
; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2
-; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0
-; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm1 {%k2} {z}
-; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm2 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} {z}
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z}
; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
; AVX512F-NEXT: vpslld $31, %zmm2, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: retq
;
@@ -125,7 +125,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1
; VL_BW_DQ-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm1
; VL_BW_DQ-NEXT: vpslld $31, %zmm1, %zmm0
-; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0
; VL_BW_DQ-NEXT: retq
%a2 = icmp eq <16 x i32> %a, %a1
@@ -162,15 +162,14 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512F: # BB#0:
-; AVX512F-NEXT: movzbl %dil, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z}
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2
-; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastq %xmm1, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: retq
;
@@ -178,10 +177,10 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: kmovb %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1
-; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; VL_BW_DQ-NEXT: vextracti64x2 $1, %zmm0, %xmm0
+; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %zmm0
; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
@@ -192,15 +191,16 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; AVX512F: # BB#0:
-; AVX512F-NEXT: movzbl %dil, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
@@ -211,8 +211,9 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
@@ -223,23 +224,25 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; AVX512F: # BB#0:
-; AVX512F-NEXT: movzbl %dil, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512F-NEXT: vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1]
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: kmovb %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1]
+; VL_BW_DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -250,15 +253,16 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; AVX512F: # BB#0:
-; AVX512F-NEXT: movzbl %dil, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
@@ -269,8 +273,9 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
@@ -281,15 +286,16 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; AVX512F: # BB#0:
-; AVX512F-NEXT: movzbl %dil, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
; AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
@@ -300,8 +306,9 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; VL_BW_DQ-NEXT: vpxord %zmm2, %zmm2, %zmm2
; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT: vpsllq $63, %zmm2, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
@@ -312,19 +319,18 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; AVX512F: # BB#0:
-; AVX512F-NEXT: movzbl %dil, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT: movb $51, %al
-; AVX512F-NEXT: movzbl %al, %eax
; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k2} {z}
-; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z}
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
@@ -337,8 +343,9 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
%c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
@@ -352,13 +359,14 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
-; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
@@ -367,11 +375,12 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
-; VL_BW_DQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
+; VL_BW_DQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT: vpsllq $63, %zmm2, %zmm0
-; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
+; VL_BW_DQ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; VL_BW_DQ-NEXT: retq
%c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
%c1 = bitcast <8 x i1>%c to i8
@@ -383,11 +392,13 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
; AVX512F: # BB#0:
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX512F-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
@@ -396,8 +407,9 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm0
; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0
; VL_BW_DQ-NEXT: vpslld $31, %zmm0, %zmm0
-; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0
+; VL_BW_DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: kmovw %k0, %eax
+; VL_BW_DQ-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; VL_BW_DQ-NEXT: retq
%b = bitcast i16 %a to <16 x i1>
%c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer
@@ -417,8 +429,22 @@ define i64 @shuf64i1_zero(i64 %a) {
; AVX512F-NEXT: .Ltmp2:
; AVX512F-NEXT: .cfi_def_cfa_register %rbp
; AVX512F-NEXT: andq $-32, %rsp
-; AVX512F-NEXT: subq $32, %rsp
-; AVX512F-NEXT: movb $0, (%rsp)
+; AVX512F-NEXT: subq $96, %rsp
+; AVX512F-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, (%rsp)
; AVX512F-NEXT: movl (%rsp), %ecx
; AVX512F-NEXT: movq %rcx, %rax
; AVX512F-NEXT: shlq $32, %rax
@@ -429,11 +455,15 @@ define i64 @shuf64i1_zero(i64 %a) {
;
; VL_BW_DQ-LABEL: shuf64i1_zero:
; VL_BW_DQ: # BB#0:
-; VL_BW_DQ-NEXT: kxorq %k0, %k0, %k0
+; VL_BW_DQ-NEXT: kmovq %rdi, %k0
+; VL_BW_DQ-NEXT: vpmovm2b %k0, %zmm0
+; VL_BW_DQ-NEXT: vpbroadcastb %xmm0, %zmm0
+; VL_BW_DQ-NEXT: vpsllw $7, %zmm0, %zmm0
+; VL_BW_DQ-NEXT: vpmovb2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovq %k0, %rax
; VL_BW_DQ-NEXT: retq
%b = bitcast i64 %a to <64 x i1>
- %c = shufflevector < 64 x i1> zeroinitializer, <64 x i1> undef, <64 x i32> zeroinitializer
+ %c = shufflevector < 64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer
%d = bitcast <64 x i1> %c to i64
ret i64 %d
}
diff --git a/test/CodeGen/X86/vector-shuffle-variable-128.ll b/test/CodeGen/X86/vector-shuffle-variable-128.ll
new file mode 100644
index 000000000000..d130e7ff00b2
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-variable-128.ll
@@ -0,0 +1,1321 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+;
+; Unary shuffle indices from registers
+;
+
+define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i64 %i1) nounwind {
+; SSE-LABEL: var_shuffle_v2f64_v2f64_xx_i64:
+; SSE: # BB#0:
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: var_shuffle_v2f64_v2f64_xx_i64:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: retq
+ %x0 = extractelement <2 x double> %x, i64 %i0
+ %x1 = extractelement <2 x double> %x, i64 %i1
+ %r0 = insertelement <2 x double> undef, double %x0, i32 0
+ %r1 = insertelement <2 x double> %r0, double %x1, i32 1
+ ret <2 x double> %r1
+}
+
+define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1) nounwind {
+; SSE-LABEL: var_shuffle_v2i64_v2i64_xx_i64:
+; SSE: # BB#0:
+; SSE-NEXT: movslq %edi, %rax
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movslq %esi, %rcx
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: var_shuffle_v2i64_v2i64_xx_i64:
+; AVX: # BB#0:
+; AVX-NEXT: movslq %edi, %rax
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movslq %esi, %rcx
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %x0 = extractelement <2 x i64> %x, i32 %i0
+ %x1 = extractelement <2 x i64> %x, i32 %i1
+ %r0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+ %r1 = insertelement <2 x i64> %r0, i64 %x1, i32 1
+ ret <2 x i64> %r1
+}
+
+define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
+; SSE2-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movslq %edi, %rax
+; SSE2-NEXT: movslq %esi, %rsi
+; SSE2-NEXT: movslq %edx, %rdx
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movslq %ecx, %rcx
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movslq %edi, %rax
+; SSSE3-NEXT: movslq %esi, %rsi
+; SSSE3-NEXT: movslq %edx, %rdx
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movslq %ecx, %rcx
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movslq %edi, %rax
+; SSE41-NEXT: movslq %esi, %rsi
+; SSE41-NEXT: movslq %edx, %rdx
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movslq %ecx, %rcx
+; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
+; AVX: # BB#0:
+; AVX-NEXT: movslq %edi, %rax
+; AVX-NEXT: movslq %esi, %rsi
+; AVX-NEXT: movslq %edx, %rdx
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX-NEXT: retq
+ %x0 = extractelement <4 x float> %x, i32 %i0
+ %x1 = extractelement <4 x float> %x, i32 %i1
+ %x2 = extractelement <4 x float> %x, i32 %i2
+ %x3 = extractelement <4 x float> %x, i32 %i3
+ %r0 = insertelement <4 x float> undef, float %x0, i32 0
+ %r1 = insertelement <4 x float> %r0, float %x1, i32 1
+ %r2 = insertelement <4 x float> %r1, float %x2, i32 2
+ %r3 = insertelement <4 x float> %r2, float %x3, i32 3
+ ret <4 x float> %r3
+}
+
+define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
+; SSE2-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movslq %edi, %rax
+; SSE2-NEXT: movslq %esi, %rsi
+; SSE2-NEXT: movslq %edx, %rdx
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movslq %ecx, %rcx
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movslq %edi, %rax
+; SSSE3-NEXT: movslq %esi, %rsi
+; SSSE3-NEXT: movslq %edx, %rdx
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movslq %ecx, %rcx
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movslq %edi, %rax
+; SSE41-NEXT: movslq %esi, %rsi
+; SSE41-NEXT: movslq %edx, %rdx
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movslq %ecx, %rcx
+; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT: pinsrd $1, -24(%rsp,%rsi,4), %xmm0
+; SSE41-NEXT: pinsrd $2, -24(%rsp,%rdx,4), %xmm0
+; SSE41-NEXT: pinsrd $3, -24(%rsp,%rcx,4), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
+; AVX: # BB#0:
+; AVX-NEXT: movslq %edi, %rax
+; AVX-NEXT: movslq %esi, %rsi
+; AVX-NEXT: movslq %edx, %rdx
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $3, -24(%rsp,%rcx,4), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %x0 = extractelement <4 x i32> %x, i32 %i0
+ %x1 = extractelement <4 x i32> %x, i32 %i1
+ %x2 = extractelement <4 x i32> %x, i32 %i2
+ %x3 = extractelement <4 x i32> %x, i32 %i3
+ %r0 = insertelement <4 x i32> undef, i32 %x0, i32 0
+ %r1 = insertelement <4 x i32> %r0, i32 %x1, i32 1
+ %r2 = insertelement <4 x i32> %r1, i32 %x2, i32 2
+ %r3 = insertelement <4 x i32> %r2, i32 %x3, i32 3
+ ret <4 x i32> %r3
+}
+
+define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i16 %i1, i16 %i2, i16 %i3, i16 %i4, i16 %i5, i16 %i6, i16 %i7) nounwind {
+; SSE2-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; SSE2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; SSE2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE2-NEXT: movswq %di, %rax
+; SSE2-NEXT: movswq %si, %rsi
+; SSE2-NEXT: movswq %dx, %rdx
+; SSE2-NEXT: movswq %cx, %r10
+; SSE2-NEXT: movswq %r8w, %r11
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movswq %r9w, %r8
+; SSE2-NEXT: movswq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: movswq {{[0-9]+}}(%rsp), %rdi
+; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
+; SSE2-NEXT: movzwl -24(%rsp,%rdi,2), %edi
+; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %esi
+; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: movzwl -24(%rsp,%rdx,2), %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %ecx
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movzwl -24(%rsp,%r11,2), %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: movd %edi, %xmm1
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: movd %esi, %xmm1
+; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %eax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; SSSE3-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; SSSE3-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSSE3-NEXT: movswq %di, %rax
+; SSSE3-NEXT: movswq %si, %rsi
+; SSSE3-NEXT: movswq %dx, %rdx
+; SSSE3-NEXT: movswq %cx, %r10
+; SSSE3-NEXT: movswq %r8w, %r11
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movswq %r9w, %r8
+; SSSE3-NEXT: movswq {{[0-9]+}}(%rsp), %rcx
+; SSSE3-NEXT: movswq {{[0-9]+}}(%rsp), %rdi
+; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
+; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %edi
+; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi
+; SSSE3-NEXT: movd %ecx, %xmm0
+; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %ecx
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: movzwl -24(%rsp,%r11,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: movd %edi, %xmm1
+; SSSE3-NEXT: movd %ecx, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSSE3-NEXT: movd %esi, %xmm1
+; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm3
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: pushq %rbx
+; SSE41-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; SSE41-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE41-NEXT: movswq %di, %rax
+; SSE41-NEXT: movswq %si, %rbx
+; SSE41-NEXT: movswq %dx, %r11
+; SSE41-NEXT: movswq %cx, %r10
+; SSE41-NEXT: movswq %r8w, %rdi
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movswq %r9w, %rcx
+; SSE41-NEXT: movswq {{[0-9]+}}(%rsp), %rdx
+; SSE41-NEXT: movswq {{[0-9]+}}(%rsp), %rsi
+; SSE41-NEXT: movzwl -16(%rsp,%rdx,2), %edx
+; SSE41-NEXT: movzwl -16(%rsp,%rsi,2), %esi
+; SSE41-NEXT: movzwl -16(%rsp,%rax,2), %eax
+; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: pinsrw $1, -16(%rsp,%rbx,2), %xmm0
+; SSE41-NEXT: pinsrw $2, -16(%rsp,%r11,2), %xmm0
+; SSE41-NEXT: pinsrw $3, -16(%rsp,%r10,2), %xmm0
+; SSE41-NEXT: pinsrw $4, -16(%rsp,%rdi,2), %xmm0
+; SSE41-NEXT: pinsrw $5, -16(%rsp,%rcx,2), %xmm0
+; SSE41-NEXT: pinsrw $6, %edx, %xmm0
+; SSE41-NEXT: pinsrw $7, %esi, %xmm0
+; SSE41-NEXT: popq %rbx
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
+; AVX: # BB#0:
+; AVX-NEXT: pushq %r14
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; AVX-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX-NEXT: movswq %di, %r10
+; AVX-NEXT: movswq %si, %r11
+; AVX-NEXT: movswq %dx, %r14
+; AVX-NEXT: movswq %cx, %rcx
+; AVX-NEXT: movswq %r8w, %rdi
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movswq %r9w, %rax
+; AVX-NEXT: movswq {{[0-9]+}}(%rsp), %rsi
+; AVX-NEXT: movswq {{[0-9]+}}(%rsp), %rdx
+; AVX-NEXT: movzwl -24(%rsp,%rsi,2), %esi
+; AVX-NEXT: movzwl -24(%rsp,%rdx,2), %edx
+; AVX-NEXT: movzwl -24(%rsp,%r10,2), %ebx
+; AVX-NEXT: vmovd %ebx, %xmm0
+; AVX-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $2, -24(%rsp,%r14,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $4, -24(%rsp,%rdi,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $6, %esi, %xmm0, %xmm0
+; AVX-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: popq %r14
+; AVX-NEXT: retq
+ %x0 = extractelement <8 x i16> %x, i16 %i0
+ %x1 = extractelement <8 x i16> %x, i16 %i1
+ %x2 = extractelement <8 x i16> %x, i16 %i2
+ %x3 = extractelement <8 x i16> %x, i16 %i3
+ %x4 = extractelement <8 x i16> %x, i16 %i4
+ %x5 = extractelement <8 x i16> %x, i16 %i5
+ %x6 = extractelement <8 x i16> %x, i16 %i6
+ %x7 = extractelement <8 x i16> %x, i16 %i7
+ %r0 = insertelement <8 x i16> undef, i16 %x0, i32 0
+ %r1 = insertelement <8 x i16> %r0, i16 %x1, i32 1
+ %r2 = insertelement <8 x i16> %r1, i16 %x2, i32 2
+ %r3 = insertelement <8 x i16> %r2, i16 %x3, i32 3
+ %r4 = insertelement <8 x i16> %r3, i16 %x4, i32 4
+ %r5 = insertelement <8 x i16> %r4, i16 %x5, i32 5
+ %r6 = insertelement <8 x i16> %r5, i16 %x6, i32 6
+ %r7 = insertelement <8 x i16> %r6, i16 %x7, i32 7
+ ret <8 x i16> %r7
+}
+
+define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %i0, i8 %i1, i8 %i2, i8 %i3, i8 %i4, i8 %i5, i8 %i6, i8 %i7, i8 %i8, i8 %i9, i8 %i10, i8 %i11, i8 %i12, i8 %i13, i8 %i14, i8 %i15) nounwind {
+; SSE2-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; SSE2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; SSE2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %r11
+; SSE2-NEXT: movzbl (%r10,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm15
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm8
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: movsbq %dl, %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm10
+; SSE2-NEXT: movsbq %dil, %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm11
+; SSE2-NEXT: movsbq %r8b, %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm7
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm12
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm13
+; SSE2-NEXT: movsbq %cl, %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm6
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm14
+; SSE2-NEXT: movsbq %sil, %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: movsbq %r9b, %rax
+; SSE2-NEXT: movzbl (%rax,%r11), %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; SSSE3-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; SSSE3-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %r10
+; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %r11
+; SSSE3-NEXT: movzbl (%r10,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm15
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm8
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm9
+; SSSE3-NEXT: movsbq %dl, %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm3
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm10
+; SSSE3-NEXT: movsbq %dil, %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm11
+; SSSE3-NEXT: movsbq %r8b, %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm7
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm12
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm13
+; SSSE3-NEXT: movsbq %cl, %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm6
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm14
+; SSSE3-NEXT: movsbq %sil, %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm5
+; SSSE3-NEXT: movsbq {{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm4
+; SSSE3-NEXT: movsbq %r9b, %rax
+; SSSE3-NEXT: movzbl (%rax,%r11), %eax
+; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: pushq %rbp
+; SSE41-NEXT: pushq %r15
+; SSE41-NEXT: pushq %r14
+; SSE41-NEXT: pushq %r13
+; SSE41-NEXT: pushq %r12
+; SSE41-NEXT: pushq %rbx
+; SSE41-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; SSE41-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE41-NEXT: movsbq %dil, %r15
+; SSE41-NEXT: movsbq %sil, %r14
+; SSE41-NEXT: movsbq %dl, %r11
+; SSE41-NEXT: movsbq %cl, %r10
+; SSE41-NEXT: movsbq %r8b, %r8
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movsbq %r9b, %r9
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r12
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r13
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rbp
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rbx
+; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; SSE41-NEXT: movzbl (%r15,%rax), %ecx
+; SSE41-NEXT: movd %ecx, %xmm0
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r15
+; SSE41-NEXT: pinsrb $1, (%r14,%rax), %xmm0
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r14
+; SSE41-NEXT: pinsrb $2, (%r11,%rax), %xmm0
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r11
+; SSE41-NEXT: pinsrb $3, (%r10,%rax), %xmm0
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %r10
+; SSE41-NEXT: pinsrb $4, (%r8,%rax), %xmm0
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT: pinsrb $5, (%r9,%rax), %xmm0
+; SSE41-NEXT: movsbq {{[0-9]+}}(%rsp), %rdx
+; SSE41-NEXT: movzbl (%r12,%rax), %esi
+; SSE41-NEXT: movzbl (%r13,%rax), %edi
+; SSE41-NEXT: movzbl (%rbp,%rax), %ebp
+; SSE41-NEXT: movzbl (%rbx,%rax), %ebx
+; SSE41-NEXT: movzbl (%r15,%rax), %r8d
+; SSE41-NEXT: movzbl (%r14,%rax), %r9d
+; SSE41-NEXT: movzbl (%r11,%rax), %r11d
+; SSE41-NEXT: movzbl (%r10,%rax), %r10d
+; SSE41-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE41-NEXT: movzbl (%rdx,%rax), %eax
+; SSE41-NEXT: pinsrb $6, %esi, %xmm0
+; SSE41-NEXT: pinsrb $7, %edi, %xmm0
+; SSE41-NEXT: pinsrb $8, %ebp, %xmm0
+; SSE41-NEXT: pinsrb $9, %ebx, %xmm0
+; SSE41-NEXT: pinsrb $10, %r8d, %xmm0
+; SSE41-NEXT: pinsrb $11, %r9d, %xmm0
+; SSE41-NEXT: pinsrb $12, %r11d, %xmm0
+; SSE41-NEXT: pinsrb $13, %r10d, %xmm0
+; SSE41-NEXT: pinsrb $14, %ecx, %xmm0
+; SSE41-NEXT: pinsrb $15, %eax, %xmm0
+; SSE41-NEXT: popq %rbx
+; SSE41-NEXT: popq %r12
+; SSE41-NEXT: popq %r13
+; SSE41-NEXT: popq %r14
+; SSE41-NEXT: popq %r15
+; SSE41-NEXT: popq %rbp
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
+; AVX: # BB#0:
+; AVX-NEXT: pushq %rbp
+; AVX-NEXT: pushq %r15
+; AVX-NEXT: pushq %r14
+; AVX-NEXT: pushq %r13
+; AVX-NEXT: pushq %r12
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; AVX-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX-NEXT: movsbq %dil, %r10
+; AVX-NEXT: movsbq %sil, %r11
+; AVX-NEXT: movsbq %dl, %r14
+; AVX-NEXT: movsbq %cl, %r15
+; AVX-NEXT: movsbq %r8b, %r8
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movsbq %r9b, %r9
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r12
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r13
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %rbp
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rdi
+; AVX-NEXT: movzbl (%r10,%rdi), %eax
+; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r10
+; AVX-NEXT: vpinsrb $1, (%r11,%rdi), %xmm0, %xmm0
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r11
+; AVX-NEXT: vpinsrb $2, (%r14,%rdi), %xmm0, %xmm0
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r14
+; AVX-NEXT: vpinsrb $3, (%r15,%rdi), %xmm0, %xmm0
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r15
+; AVX-NEXT: vpinsrb $4, (%r8,%rdi), %xmm0, %xmm0
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %r8
+; AVX-NEXT: vpinsrb $5, (%r9,%rdi), %xmm0, %xmm0
+; AVX-NEXT: movsbq {{[0-9]+}}(%rsp), %rsi
+; AVX-NEXT: movzbl (%r12,%rdi), %edx
+; AVX-NEXT: movzbl (%r13,%rdi), %ebx
+; AVX-NEXT: movzbl (%rbp,%rdi), %ebp
+; AVX-NEXT: movzbl (%rcx,%rdi), %ecx
+; AVX-NEXT: movzbl (%r10,%rdi), %eax
+; AVX-NEXT: movzbl (%r11,%rdi), %r9d
+; AVX-NEXT: movzbl (%r14,%rdi), %r10d
+; AVX-NEXT: movzbl (%r15,%rdi), %r11d
+; AVX-NEXT: movzbl (%r8,%rdi), %r8d
+; AVX-NEXT: movzbl (%rsi,%rdi), %esi
+; AVX-NEXT: vpinsrb $6, %edx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $7, %ebx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $8, %ebp, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $11, %r9d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $12, %r10d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $13, %r11d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $14, %r8d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $15, %esi, %xmm0, %xmm0
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: popq %r12
+; AVX-NEXT: popq %r13
+; AVX-NEXT: popq %r14
+; AVX-NEXT: popq %r15
+; AVX-NEXT: popq %rbp
+; AVX-NEXT: retq
+ %x0 = extractelement <16 x i8> %x, i8 %i0
+ %x1 = extractelement <16 x i8> %x, i8 %i1
+ %x2 = extractelement <16 x i8> %x, i8 %i2
+ %x3 = extractelement <16 x i8> %x, i8 %i3
+ %x4 = extractelement <16 x i8> %x, i8 %i4
+ %x5 = extractelement <16 x i8> %x, i8 %i5
+ %x6 = extractelement <16 x i8> %x, i8 %i6
+ %x7 = extractelement <16 x i8> %x, i8 %i7
+ %x8 = extractelement <16 x i8> %x, i8 %i8
+ %x9 = extractelement <16 x i8> %x, i8 %i9
+ %x10 = extractelement <16 x i8> %x, i8 %i10
+ %x11 = extractelement <16 x i8> %x, i8 %i11
+ %x12 = extractelement <16 x i8> %x, i8 %i12
+ %x13 = extractelement <16 x i8> %x, i8 %i13
+ %x14 = extractelement <16 x i8> %x, i8 %i14
+ %x15 = extractelement <16 x i8> %x, i8 %i15
+ %r0 = insertelement <16 x i8> undef, i8 %x0 , i32 0
+ %r1 = insertelement <16 x i8> %r0 , i8 %x1 , i32 1
+ %r2 = insertelement <16 x i8> %r1 , i8 %x2 , i32 2
+ %r3 = insertelement <16 x i8> %r2 , i8 %x3 , i32 3
+ %r4 = insertelement <16 x i8> %r3 , i8 %x4 , i32 4
+ %r5 = insertelement <16 x i8> %r4 , i8 %x5 , i32 5
+ %r6 = insertelement <16 x i8> %r5 , i8 %x6 , i32 6
+ %r7 = insertelement <16 x i8> %r6 , i8 %x7 , i32 7
+ %r8 = insertelement <16 x i8> %r7 , i8 %x8 , i32 8
+ %r9 = insertelement <16 x i8> %r8 , i8 %x9 , i32 9
+ %r10 = insertelement <16 x i8> %r9 , i8 %x10, i32 10
+ %r11 = insertelement <16 x i8> %r10, i8 %x11, i32 11
+ %r12 = insertelement <16 x i8> %r11, i8 %x12, i32 12
+ %r13 = insertelement <16 x i8> %r12, i8 %x13, i32 13
+ %r14 = insertelement <16 x i8> %r13, i8 %x14, i32 14
+ %r15 = insertelement <16 x i8> %r14, i8 %x15, i32 15
+ ret <16 x i8> %r15
+}
+
+;
+; Unary shuffle indices from memory
+;
+
+define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwind {
+; SSE2-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movslq (%rdi), %rax
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movslq 4(%rdi), %rcx
+; SSE2-NEXT: movslq 8(%rdi), %rdx
+; SSE2-NEXT: movslq 12(%rdi), %rsi
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movslq (%rdi), %rax
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movslq 4(%rdi), %rcx
+; SSSE3-NEXT: movslq 8(%rdi), %rdx
+; SSSE3-NEXT: movslq 12(%rdi), %rsi
+; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movslq (%rdi), %rax
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movslq 4(%rdi), %rcx
+; SSE41-NEXT: movslq 8(%rdi), %rdx
+; SSE41-NEXT: movslq 12(%rdi), %rsi
+; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT: pinsrd $1, -24(%rsp,%rcx,4), %xmm0
+; SSE41-NEXT: pinsrd $2, -24(%rsp,%rdx,4), %xmm0
+; SSE41-NEXT: pinsrd $3, -24(%rsp,%rsi,4), %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
+; AVX: # BB#0:
+; AVX-NEXT: movslq (%rdi), %rax
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movslq 4(%rdi), %rcx
+; AVX-NEXT: movslq 8(%rdi), %rdx
+; AVX-NEXT: movslq 12(%rdi), %rsi
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vpinsrd $1, -24(%rsp,%rcx,4), %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $3, -24(%rsp,%rsi,4), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %p0 = getelementptr inbounds i32, i32* %i, i64 0
+ %p1 = getelementptr inbounds i32, i32* %i, i64 1
+ %p2 = getelementptr inbounds i32, i32* %i, i64 2
+ %p3 = getelementptr inbounds i32, i32* %i, i64 3
+ %i0 = load i32, i32* %p0, align 4
+ %i1 = load i32, i32* %p1, align 4
+ %i2 = load i32, i32* %p2, align 4
+ %i3 = load i32, i32* %p3, align 4
+ %x0 = extractelement <4 x i32> %x, i32 %i0
+ %x1 = extractelement <4 x i32> %x, i32 %i1
+ %x2 = extractelement <4 x i32> %x, i32 %i2
+ %x3 = extractelement <4 x i32> %x, i32 %i3
+ %r0 = insertelement <4 x i32> undef, i32 %x0, i32 0
+ %r1 = insertelement <4 x i32> %r0, i32 %x1, i32 1
+ %r2 = insertelement <4 x i32> %r1, i32 %x2, i32 2
+ %r3 = insertelement <4 x i32> %r2, i32 %x3, i32 3
+ ret <4 x i32> %r3
+}
+
+define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* %i) nounwind {
+; SSE2-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsbq (%rdi), %rcx
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: movsbq 8(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm8
+; SSE2-NEXT: movsbq 12(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm9
+; SSE2-NEXT: movsbq 4(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm3
+; SSE2-NEXT: movsbq 14(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm10
+; SSE2-NEXT: movsbq 6(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm5
+; SSE2-NEXT: movsbq 10(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm11
+; SSE2-NEXT: movsbq 2(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm7
+; SSE2-NEXT: movsbq 15(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm12
+; SSE2-NEXT: movsbq 7(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: movsbq 11(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm13
+; SSE2-NEXT: movsbq 3(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm6
+; SSE2-NEXT: movsbq 13(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm14
+; SSE2-NEXT: movsbq 5(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm4
+; SSE2-NEXT: movsbq 9(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %ecx
+; SSE2-NEXT: movd %ecx, %xmm15
+; SSE2-NEXT: movsbq 1(%rdi), %rcx
+; SSE2-NEXT: movzbl (%rcx,%rax), %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsbq (%rdi), %rcx
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm0
+; SSSE3-NEXT: movsbq 8(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm8
+; SSSE3-NEXT: movsbq 12(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm9
+; SSSE3-NEXT: movsbq 4(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm3
+; SSSE3-NEXT: movsbq 14(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm10
+; SSSE3-NEXT: movsbq 6(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm5
+; SSSE3-NEXT: movsbq 10(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm11
+; SSSE3-NEXT: movsbq 2(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm7
+; SSSE3-NEXT: movsbq 15(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm12
+; SSSE3-NEXT: movsbq 7(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm2
+; SSSE3-NEXT: movsbq 11(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm13
+; SSSE3-NEXT: movsbq 3(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm6
+; SSSE3-NEXT: movsbq 13(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm14
+; SSSE3-NEXT: movsbq 5(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm4
+; SSSE3-NEXT: movsbq 9(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm15
+; SSSE3-NEXT: movsbq 1(%rdi), %rcx
+; SSSE3-NEXT: movzbl (%rcx,%rax), %eax
+; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
+; SSE41: # BB#0:
+; SSE41-NEXT: pushq %rbp
+; SSE41-NEXT: pushq %r15
+; SSE41-NEXT: pushq %r14
+; SSE41-NEXT: pushq %r13
+; SSE41-NEXT: pushq %r12
+; SSE41-NEXT: pushq %rbx
+; SSE41-NEXT: movsbq (%rdi), %rax
+; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movsbq 1(%rdi), %r15
+; SSE41-NEXT: movsbq 2(%rdi), %r8
+; SSE41-NEXT: movsbq 3(%rdi), %r9
+; SSE41-NEXT: movsbq 4(%rdi), %r10
+; SSE41-NEXT: movsbq 5(%rdi), %r11
+; SSE41-NEXT: movsbq 6(%rdi), %r14
+; SSE41-NEXT: movsbq 7(%rdi), %r12
+; SSE41-NEXT: movsbq 8(%rdi), %r13
+; SSE41-NEXT: movsbq 9(%rdi), %rdx
+; SSE41-NEXT: movsbq 10(%rdi), %rcx
+; SSE41-NEXT: movsbq 11(%rdi), %rsi
+; SSE41-NEXT: movsbq 12(%rdi), %rbx
+; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rbp
+; SSE41-NEXT: movzbl (%rax,%rbp), %eax
+; SSE41-NEXT: movd %eax, %xmm0
+; SSE41-NEXT: movsbq 13(%rdi), %rax
+; SSE41-NEXT: pinsrb $1, (%r15,%rbp), %xmm0
+; SSE41-NEXT: movsbq 14(%rdi), %r15
+; SSE41-NEXT: movsbq 15(%rdi), %rdi
+; SSE41-NEXT: movzbl (%rdi,%rbp), %edi
+; SSE41-NEXT: movzbl (%r15,%rbp), %r15d
+; SSE41-NEXT: movzbl (%rax,%rbp), %eax
+; SSE41-NEXT: movzbl (%rbx,%rbp), %ebx
+; SSE41-NEXT: movzbl (%rsi,%rbp), %esi
+; SSE41-NEXT: movzbl (%rcx,%rbp), %ecx
+; SSE41-NEXT: movzbl (%rdx,%rbp), %edx
+; SSE41-NEXT: movzbl (%r13,%rbp), %r13d
+; SSE41-NEXT: movzbl (%r12,%rbp), %r12d
+; SSE41-NEXT: movzbl (%r14,%rbp), %r14d
+; SSE41-NEXT: movzbl (%r11,%rbp), %r11d
+; SSE41-NEXT: movzbl (%r10,%rbp), %r10d
+; SSE41-NEXT: movzbl (%r9,%rbp), %r9d
+; SSE41-NEXT: movzbl (%r8,%rbp), %ebp
+; SSE41-NEXT: pinsrb $2, %ebp, %xmm0
+; SSE41-NEXT: pinsrb $3, %r9d, %xmm0
+; SSE41-NEXT: pinsrb $4, %r10d, %xmm0
+; SSE41-NEXT: pinsrb $5, %r11d, %xmm0
+; SSE41-NEXT: pinsrb $6, %r14d, %xmm0
+; SSE41-NEXT: pinsrb $7, %r12d, %xmm0
+; SSE41-NEXT: pinsrb $8, %r13d, %xmm0
+; SSE41-NEXT: pinsrb $9, %edx, %xmm0
+; SSE41-NEXT: pinsrb $10, %ecx, %xmm0
+; SSE41-NEXT: pinsrb $11, %esi, %xmm0
+; SSE41-NEXT: pinsrb $12, %ebx, %xmm0
+; SSE41-NEXT: pinsrb $13, %eax, %xmm0
+; SSE41-NEXT: pinsrb $14, %r15d, %xmm0
+; SSE41-NEXT: pinsrb $15, %edi, %xmm0
+; SSE41-NEXT: popq %rbx
+; SSE41-NEXT: popq %r12
+; SSE41-NEXT: popq %r13
+; SSE41-NEXT: popq %r14
+; SSE41-NEXT: popq %r15
+; SSE41-NEXT: popq %rbp
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
+; AVX: # BB#0:
+; AVX-NEXT: pushq %rbp
+; AVX-NEXT: pushq %r15
+; AVX-NEXT: pushq %r14
+; AVX-NEXT: pushq %r13
+; AVX-NEXT: pushq %r12
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: movsbq (%rdi), %rsi
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movsbq 1(%rdi), %r15
+; AVX-NEXT: movsbq 2(%rdi), %r8
+; AVX-NEXT: movsbq 3(%rdi), %r9
+; AVX-NEXT: movsbq 4(%rdi), %r10
+; AVX-NEXT: movsbq 5(%rdi), %r11
+; AVX-NEXT: movsbq 6(%rdi), %r14
+; AVX-NEXT: movsbq 7(%rdi), %r12
+; AVX-NEXT: movsbq 8(%rdi), %r13
+; AVX-NEXT: movsbq 9(%rdi), %rdx
+; AVX-NEXT: movsbq 10(%rdi), %rax
+; AVX-NEXT: movsbq 11(%rdi), %rcx
+; AVX-NEXT: movsbq 12(%rdi), %rbx
+; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rbp
+; AVX-NEXT: movzbl (%rsi,%rbp), %esi
+; AVX-NEXT: vmovd %esi, %xmm0
+; AVX-NEXT: movsbq 13(%rdi), %rsi
+; AVX-NEXT: vpinsrb $1, (%r15,%rbp), %xmm0, %xmm0
+; AVX-NEXT: movsbq 14(%rdi), %r15
+; AVX-NEXT: movsbq 15(%rdi), %rdi
+; AVX-NEXT: movzbl (%rdi,%rbp), %edi
+; AVX-NEXT: movzbl (%r15,%rbp), %r15d
+; AVX-NEXT: movzbl (%rsi,%rbp), %esi
+; AVX-NEXT: movzbl (%rbx,%rbp), %ebx
+; AVX-NEXT: movzbl (%rcx,%rbp), %ecx
+; AVX-NEXT: movzbl (%rax,%rbp), %eax
+; AVX-NEXT: movzbl (%rdx,%rbp), %edx
+; AVX-NEXT: movzbl (%r13,%rbp), %r13d
+; AVX-NEXT: movzbl (%r12,%rbp), %r12d
+; AVX-NEXT: movzbl (%r14,%rbp), %r14d
+; AVX-NEXT: movzbl (%r11,%rbp), %r11d
+; AVX-NEXT: movzbl (%r10,%rbp), %r10d
+; AVX-NEXT: movzbl (%r9,%rbp), %r9d
+; AVX-NEXT: movzbl (%r8,%rbp), %ebp
+; AVX-NEXT: vpinsrb $2, %ebp, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $3, %r9d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $5, %r11d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $6, %r14d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $7, %r12d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $8, %r13d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $9, %edx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $12, %ebx, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $13, %esi, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $14, %r15d, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: popq %r12
+; AVX-NEXT: popq %r13
+; AVX-NEXT: popq %r14
+; AVX-NEXT: popq %r15
+; AVX-NEXT: popq %rbp
+; AVX-NEXT: retq
+ %p0 = getelementptr inbounds i8, i8* %i, i64 0
+ %p1 = getelementptr inbounds i8, i8* %i, i64 1
+ %p2 = getelementptr inbounds i8, i8* %i, i64 2
+ %p3 = getelementptr inbounds i8, i8* %i, i64 3
+ %p4 = getelementptr inbounds i8, i8* %i, i64 4
+ %p5 = getelementptr inbounds i8, i8* %i, i64 5
+ %p6 = getelementptr inbounds i8, i8* %i, i64 6
+ %p7 = getelementptr inbounds i8, i8* %i, i64 7
+ %p8 = getelementptr inbounds i8, i8* %i, i64 8
+ %p9 = getelementptr inbounds i8, i8* %i, i64 9
+ %p10 = getelementptr inbounds i8, i8* %i, i64 10
+ %p11 = getelementptr inbounds i8, i8* %i, i64 11
+ %p12 = getelementptr inbounds i8, i8* %i, i64 12
+ %p13 = getelementptr inbounds i8, i8* %i, i64 13
+ %p14 = getelementptr inbounds i8, i8* %i, i64 14
+ %p15 = getelementptr inbounds i8, i8* %i, i64 15
+ %i0 = load i8, i8* %p0 , align 4
+ %i1 = load i8, i8* %p1 , align 4
+ %i2 = load i8, i8* %p2 , align 4
+ %i3 = load i8, i8* %p3 , align 4
+ %i4 = load i8, i8* %p4 , align 4
+ %i5 = load i8, i8* %p5 , align 4
+ %i6 = load i8, i8* %p6 , align 4
+ %i7 = load i8, i8* %p7 , align 4
+ %i8 = load i8, i8* %p8 , align 4
+ %i9 = load i8, i8* %p9 , align 4
+ %i10 = load i8, i8* %p10, align 4
+ %i11 = load i8, i8* %p11, align 4
+ %i12 = load i8, i8* %p12, align 4
+ %i13 = load i8, i8* %p13, align 4
+ %i14 = load i8, i8* %p14, align 4
+ %i15 = load i8, i8* %p15, align 4
+ %x0 = extractelement <16 x i8> %x, i8 %i0
+ %x1 = extractelement <16 x i8> %x, i8 %i1
+ %x2 = extractelement <16 x i8> %x, i8 %i2
+ %x3 = extractelement <16 x i8> %x, i8 %i3
+ %x4 = extractelement <16 x i8> %x, i8 %i4
+ %x5 = extractelement <16 x i8> %x, i8 %i5
+ %x6 = extractelement <16 x i8> %x, i8 %i6
+ %x7 = extractelement <16 x i8> %x, i8 %i7
+ %x8 = extractelement <16 x i8> %x, i8 %i8
+ %x9 = extractelement <16 x i8> %x, i8 %i9
+ %x10 = extractelement <16 x i8> %x, i8 %i10
+ %x11 = extractelement <16 x i8> %x, i8 %i11
+ %x12 = extractelement <16 x i8> %x, i8 %i12
+ %x13 = extractelement <16 x i8> %x, i8 %i13
+ %x14 = extractelement <16 x i8> %x, i8 %i14
+ %x15 = extractelement <16 x i8> %x, i8 %i15
+ %r0 = insertelement <16 x i8> undef, i8 %x0 , i32 0
+ %r1 = insertelement <16 x i8> %r0 , i8 %x1 , i32 1
+ %r2 = insertelement <16 x i8> %r1 , i8 %x2 , i32 2
+ %r3 = insertelement <16 x i8> %r2 , i8 %x3 , i32 3
+ %r4 = insertelement <16 x i8> %r3 , i8 %x4 , i32 4
+ %r5 = insertelement <16 x i8> %r4 , i8 %x5 , i32 5
+ %r6 = insertelement <16 x i8> %r5 , i8 %x6 , i32 6
+ %r7 = insertelement <16 x i8> %r6 , i8 %x7 , i32 7
+ %r8 = insertelement <16 x i8> %r7 , i8 %x8 , i32 8
+ %r9 = insertelement <16 x i8> %r8 , i8 %x9 , i32 9
+ %r10 = insertelement <16 x i8> %r9 , i8 %x10, i32 10
+ %r11 = insertelement <16 x i8> %r10, i8 %x11, i32 11
+ %r12 = insertelement <16 x i8> %r11, i8 %x12, i32 12
+ %r13 = insertelement <16 x i8> %r12, i8 %x13, i32 13
+ %r14 = insertelement <16 x i8> %r13, i8 %x14, i32 14
+ %r15 = insertelement <16 x i8> %r14, i8 %x15, i32 15
+ ret <16 x i8> %r15
+}
+
+;
+; Binary shuffle indices from registers
+;
+
+define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float> %y, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
+; SSE-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
+; SSE: # BB#0:
+; SSE-NEXT: movslq %edi, %rax
+; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movslq %edx, %rdx
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movslq %ecx, %rcx
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
+; AVX: # BB#0:
+; AVX-NEXT: movslq %edi, %rax
+; AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movslq %edx, %rdx
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movslq %ecx, %rcx
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %x0 = extractelement <4 x float> %x, i32 %i0
+ %x1 = extractelement <4 x float> %x, i32 %i1
+ %y2 = extractelement <4 x float> %y, i32 %i2
+ %x3 = extractelement <4 x float> %x, i32 %i3
+ %r0 = insertelement <4 x float> undef, float %x0, i32 0
+ %r1 = insertelement <4 x float> %r0, float 0.0, i32 1
+ %r2 = insertelement <4 x float> %r1, float %y2, i32 2
+ %r3 = insertelement <4 x float> %r2, float %x3, i32 3
+ ret <4 x float> %r3
+}
+
+define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %y, i16 %i0, i16 %i1, i16 %i2, i16 %i3, i16 %i4, i16 %i5, i16 %i6, i16 %i7) nounwind {
+; SSE2-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
+; SSE2: # BB#0:
+; SSE2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; SSE2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; SSE2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE2-NEXT: movswq %di, %r10
+; SSE2-NEXT: movswq %si, %rsi
+; SSE2-NEXT: movswq %dx, %r11
+; SSE2-NEXT: movswq %cx, %rcx
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movswq %r8w, %rdi
+; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movswq %r9w, %rax
+; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %esi
+; SSE2-NEXT: xorl %edx, %edx
+; SSE2-NEXT: movd %edx, %xmm0
+; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: movd %esi, %xmm2
+; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: movzwl -40(%rsp,%r10,2), %eax
+; SSE2-NEXT: movzwl -40(%rsp,%r11,2), %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movzwl -40(%rsp,%rdi,2), %eax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; SSSE3-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; SSSE3-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSSE3-NEXT: movswq %di, %r10
+; SSSE3-NEXT: movswq %si, %rsi
+; SSSE3-NEXT: movswq %dx, %r11
+; SSSE3-NEXT: movswq %cx, %rcx
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movswq %r8w, %rdi
+; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movswq %r9w, %rax
+; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %esi
+; SSSE3-NEXT: xorl %edx, %edx
+; SSSE3-NEXT: movd %edx, %xmm0
+; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: movd %esi, %xmm2
+; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm3
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSSE3-NEXT: movzwl -40(%rsp,%r10,2), %eax
+; SSSE3-NEXT: movzwl -40(%rsp,%r11,2), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: movzwl -40(%rsp,%rdi,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm3
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
+; SSE41: # BB#0:
+; SSE41-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; SSE41-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE41-NEXT: movswq %di, %rax
+; SSE41-NEXT: movswq %si, %rsi
+; SSE41-NEXT: movswq %dx, %rdx
+; SSE41-NEXT: movswq %cx, %r10
+; SSE41-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movswq %r8w, %rdi
+; SSE41-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE41-NEXT: movswq %r9w, %rcx
+; SSE41-NEXT: movzwl -40(%rsp,%rax,2), %eax
+; SSE41-NEXT: movd %eax, %xmm1
+; SSE41-NEXT: pinsrw $1, -24(%rsp,%rsi,2), %xmm1
+; SSE41-NEXT: pinsrw $2, -40(%rsp,%rdx,2), %xmm1
+; SSE41-NEXT: pinsrw $3, -24(%rsp,%r10,2), %xmm1
+; SSE41-NEXT: pinsrw $4, -40(%rsp,%rdi,2), %xmm1
+; SSE41-NEXT: pinsrw $5, -24(%rsp,%rcx,2), %xmm1
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX1-NEXT: movswq %di, %r10
+; AVX1-NEXT: movswq %si, %r11
+; AVX1-NEXT: movswq %dx, %rdx
+; AVX1-NEXT: movswq %cx, %rcx
+; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movswq %r8w, %rdi
+; AVX1-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movswq %r9w, %rax
+; AVX1-NEXT: movzwl -40(%rsp,%r10,2), %esi
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
+; AVX2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
+; AVX2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; AVX2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX2-NEXT: movswq %di, %r10
+; AVX2-NEXT: movswq %si, %r11
+; AVX2-NEXT: movswq %dx, %rdx
+; AVX2-NEXT: movswq %cx, %rcx
+; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movswq %r8w, %rdi
+; AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movswq %r9w, %rax
+; AVX2-NEXT: movzwl -40(%rsp,%r10,2), %esi
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX2-NEXT: retq
+ %x0 = extractelement <8 x i16> %x, i16 %i0
+ %y1 = extractelement <8 x i16> %y, i16 %i1
+ %x2 = extractelement <8 x i16> %x, i16 %i2
+ %y3 = extractelement <8 x i16> %y, i16 %i3
+ %x4 = extractelement <8 x i16> %x, i16 %i4
+ %y5 = extractelement <8 x i16> %y, i16 %i5
+ %x6 = extractelement <8 x i16> %x, i16 %i6
+ %x7 = extractelement <8 x i16> %x, i16 %i7
+ %r0 = insertelement <8 x i16> undef, i16 %x0, i32 0
+ %r1 = insertelement <8 x i16> %r0, i16 %y1, i32 1
+ %r2 = insertelement <8 x i16> %r1, i16 %x2, i32 2
+ %r3 = insertelement <8 x i16> %r2, i16 %y3, i32 3
+ %r4 = insertelement <8 x i16> %r3, i16 %x4, i32 4
+ %r5 = insertelement <8 x i16> %r4, i16 %y5, i32 5
+ %r6 = insertelement <8 x i16> %r5, i16 0, i32 6
+ %r7 = insertelement <8 x i16> %r6, i16 0, i32 7
+ ret <8 x i16> %r7
+}
diff --git a/test/CodeGen/X86/vector-shuffle-variable-256.ll b/test/CodeGen/X86/vector-shuffle-variable-256.ll
new file mode 100644
index 000000000000..e8d9aa20491b
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-variable-256.ll
@@ -0,0 +1,720 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+;
+; Unary shuffle indices from registers
+;
+
+define <4 x double> @var_shuffle_v4f64_v4f64_xxxx_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
+; ALL-LABEL: var_shuffle_v4f64_v4f64_xxxx_i64:
+; ALL: # BB#0:
+; ALL-NEXT: pushq %rbp
+; ALL-NEXT: movq %rsp, %rbp
+; ALL-NEXT: andq $-32, %rsp
+; ALL-NEXT: subq $64, %rsp
+; ALL-NEXT: vmovaps %ymm0, (%rsp)
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: movq %rbp, %rsp
+; ALL-NEXT: popq %rbp
+; ALL-NEXT: retq
+ %x0 = extractelement <4 x double> %x, i64 %i0
+ %x1 = extractelement <4 x double> %x, i64 %i1
+ %x2 = extractelement <4 x double> %x, i64 %i2
+ %x3 = extractelement <4 x double> %x, i64 %i3
+ %r0 = insertelement <4 x double> undef, double %x0, i32 0
+ %r1 = insertelement <4 x double> %r0, double %x1, i32 1
+ %r2 = insertelement <4 x double> %r1, double %x2, i32 2
+ %r3 = insertelement <4 x double> %r2, double %x3, i32 3
+ ret <4 x double> %r3
+}
+
+define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
+; ALL-LABEL: var_shuffle_v4f64_v4f64_uxx0_i64:
+; ALL: # BB#0:
+; ALL-NEXT: pushq %rbp
+; ALL-NEXT: movq %rsp, %rbp
+; ALL-NEXT: andq $-32, %rsp
+; ALL-NEXT: subq $64, %rsp
+; ALL-NEXT: vmovaps %ymm0, (%rsp)
+; ALL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: movq %rbp, %rsp
+; ALL-NEXT: popq %rbp
+; ALL-NEXT: retq
+ %x0 = extractelement <4 x double> %x, i64 %i0
+ %x1 = extractelement <4 x double> %x, i64 %i1
+ %x2 = extractelement <4 x double> %x, i64 %i2
+ %x3 = extractelement <4 x double> %x, i64 %i3
+ %r0 = insertelement <4 x double> undef, double undef, i32 0
+ %r1 = insertelement <4 x double> %r0, double %x1, i32 1
+ %r2 = insertelement <4 x double> %r1, double %x2, i32 2
+ %r3 = insertelement <4 x double> %r2, double 0.0, i32 3
+ ret <4 x double> %r3
+}
+
+define <4 x double> @var_shuffle_v4f64_v2f64_xxxx_i64(<2 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
+; ALL-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64:
+; ALL: # BB#0:
+; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: retq
+ %x0 = extractelement <2 x double> %x, i64 %i0
+ %x1 = extractelement <2 x double> %x, i64 %i1
+ %x2 = extractelement <2 x double> %x, i64 %i2
+ %x3 = extractelement <2 x double> %x, i64 %i3
+ %r0 = insertelement <4 x double> undef, double %x0, i32 0
+ %r1 = insertelement <4 x double> %r0, double %x1, i32 1
+ %r2 = insertelement <4 x double> %r1, double %x2, i32 2
+ %r3 = insertelement <4 x double> %r2, double %x3, i32 3
+ ret <4 x double> %r3
+}
+
+define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
+; AVX1-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+ %x0 = extractelement <4 x i64> %x, i64 %i0
+ %x1 = extractelement <4 x i64> %x, i64 %i1
+ %x2 = extractelement <4 x i64> %x, i64 %i2
+ %x3 = extractelement <4 x i64> %x, i64 %i3
+ %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+ %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
+ %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
+ %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
+ ret <4 x i64> %r3
+}
+
+define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
+; AVX1-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+ %x0 = extractelement <4 x i64> %x, i64 %i0
+ %x1 = extractelement <4 x i64> %x, i64 %i1
+ %x2 = extractelement <4 x i64> %x, i64 %i2
+ %x3 = extractelement <4 x i64> %x, i64 %i3
+ %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+ %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
+ %r2 = insertelement <4 x i64> %r1, i64 0, i32 2
+ %r3 = insertelement <4 x i64> %r2, i64 0, i32 3
+ ret <4 x i64> %r3
+}
+
+define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
+; AVX1-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %x0 = extractelement <2 x i64> %x, i64 %i0
+ %x1 = extractelement <2 x i64> %x, i64 %i1
+ %x2 = extractelement <2 x i64> %x, i64 %i2
+ %x3 = extractelement <2 x i64> %x, i64 %i3
+ %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+ %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
+ %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
+ %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
+ ret <4 x i64> %r3
+}
+
+define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
+; AVX1-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: movslq %edi, %rax
+; AVX1-NEXT: movslq %esi, %rsi
+; AVX1-NEXT: movslq %edx, %rdx
+; AVX1-NEXT: movslq %ecx, %r11
+; AVX1-NEXT: movslq %r8d, %r10
+; AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; AVX1-NEXT: movslq %r9d, %r8
+; AVX1-NEXT: movslq 16(%rbp), %rdi
+; AVX1-NEXT: movslq 24(%rbp), %rcx
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm1
+; AVX2-NEXT: vmovd %esi, %xmm2
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm2
+; AVX2-NEXT: vmovd %edx, %xmm3
+; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm3
+; AVX2-NEXT: vmovd %ecx, %xmm4
+; AVX2-NEXT: vpermps %ymm0, %ymm4, %ymm4
+; AVX2-NEXT: vmovd %r8d, %xmm5
+; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm5
+; AVX2-NEXT: vmovd %r9d, %xmm6
+; AVX2-NEXT: vpermps %ymm0, %ymm6, %ymm6
+; AVX2-NEXT: vmovd {{.*#+}} xmm7 = mem[0],zero,zero,zero
+; AVX2-NEXT: vpermps %ymm0, %ymm7, %ymm7
+; AVX2-NEXT: vmovd {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; AVX2-NEXT: vpermps %ymm0, %ymm8, %ymm0
+; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %x0 = extractelement <8 x float> %x, i32 %i0
+ %x1 = extractelement <8 x float> %x, i32 %i1
+ %x2 = extractelement <8 x float> %x, i32 %i2
+ %x3 = extractelement <8 x float> %x, i32 %i3
+ %x4 = extractelement <8 x float> %x, i32 %i4
+ %x5 = extractelement <8 x float> %x, i32 %i5
+ %x6 = extractelement <8 x float> %x, i32 %i6
+ %x7 = extractelement <8 x float> %x, i32 %i7
+ %r0 = insertelement <8 x float> undef, float %x0, i32 0
+ %r1 = insertelement <8 x float> %r0, float %x1, i32 1
+ %r2 = insertelement <8 x float> %r1, float %x2, i32 2
+ %r3 = insertelement <8 x float> %r2, float %x3, i32 3
+ %r4 = insertelement <8 x float> %r3, float %x4, i32 4
+ %r5 = insertelement <8 x float> %r4, float %x5, i32 5
+ %r6 = insertelement <8 x float> %r5, float %x6, i32 6
+ %r7 = insertelement <8 x float> %r6, float %x7, i32 7
+ ret <8 x float> %r7
+}
+
+define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
+; ALL-LABEL: var_shuffle_v8f32_v4f32_xxxxxxxx_i32:
+; ALL: # BB#0:
+; ALL-NEXT: movslq %edi, %rax
+; ALL-NEXT: movslq %esi, %rsi
+; ALL-NEXT: movslq %edx, %rdx
+; ALL-NEXT: movslq %ecx, %r11
+; ALL-NEXT: movslq %r8d, %r10
+; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; ALL-NEXT: movslq %r9d, %r8
+; ALL-NEXT: movslq {{[0-9]+}}(%rsp), %rdi
+; ALL-NEXT: movslq {{[0-9]+}}(%rsp), %rcx
+; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ALL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; ALL-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; ALL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; ALL-NEXT: retq
+ %x0 = extractelement <4 x float> %x, i32 %i0
+ %x1 = extractelement <4 x float> %x, i32 %i1
+ %x2 = extractelement <4 x float> %x, i32 %i2
+ %x3 = extractelement <4 x float> %x, i32 %i3
+ %x4 = extractelement <4 x float> %x, i32 %i4
+ %x5 = extractelement <4 x float> %x, i32 %i5
+ %x6 = extractelement <4 x float> %x, i32 %i6
+ %x7 = extractelement <4 x float> %x, i32 %i7
+ %r0 = insertelement <8 x float> undef, float %x0, i32 0
+ %r1 = insertelement <8 x float> %r0, float %x1, i32 1
+ %r2 = insertelement <8 x float> %r1, float %x2, i32 2
+ %r3 = insertelement <8 x float> %r2, float %x3, i32 3
+ %r4 = insertelement <8 x float> %r3, float %x4, i32 4
+ %r5 = insertelement <8 x float> %r4, float %x5, i32 5
+ %r6 = insertelement <8 x float> %r5, float %x6, i32 6
+ %r7 = insertelement <8 x float> %r6, float %x7, i32 7
+ ret <8 x float> %r7
+}
+
+define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
+; AVX1-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; AVX1-NEXT: movslq 32(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: movslq 40(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq 48(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq 56(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq 64(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq 72(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq 80(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq 88(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq %edi, %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: movslq %esi, %rax
+; AVX1-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq %edx, %rax
+; AVX1-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq %ecx, %rax
+; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq %r8d, %rax
+; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq %r9d, %rax
+; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq 16(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movslq 24(%rbp), %rax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: movslq 32(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: movslq 40(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq 48(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq 56(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq 64(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq 72(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq 80(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq 88(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq %edi, %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: movslq %esi, %rax
+; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq %edx, %rax
+; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq %ecx, %rax
+; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq %r8d, %rax
+; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq %r9d, %rax
+; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq 16(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movslq 24(%rbp), %rax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+ %x0 = extractelement <16 x i16> %x, i32 %i0
+ %x1 = extractelement <16 x i16> %x, i32 %i1
+ %x2 = extractelement <16 x i16> %x, i32 %i2
+ %x3 = extractelement <16 x i16> %x, i32 %i3
+ %x4 = extractelement <16 x i16> %x, i32 %i4
+ %x5 = extractelement <16 x i16> %x, i32 %i5
+ %x6 = extractelement <16 x i16> %x, i32 %i6
+ %x7 = extractelement <16 x i16> %x, i32 %i7
+ %x8 = extractelement <16 x i16> %x, i32 %i8
+ %x9 = extractelement <16 x i16> %x, i32 %i9
+ %x10 = extractelement <16 x i16> %x, i32 %i10
+ %x11 = extractelement <16 x i16> %x, i32 %i11
+ %x12 = extractelement <16 x i16> %x, i32 %i12
+ %x13 = extractelement <16 x i16> %x, i32 %i13
+ %x14 = extractelement <16 x i16> %x, i32 %i14
+ %x15 = extractelement <16 x i16> %x, i32 %i15
+ %r0 = insertelement <16 x i16> undef, i16 %x0 , i32 0
+ %r1 = insertelement <16 x i16> %r0 , i16 %x1 , i32 1
+ %r2 = insertelement <16 x i16> %r1 , i16 %x2 , i32 2
+ %r3 = insertelement <16 x i16> %r2 , i16 %x3 , i32 3
+ %r4 = insertelement <16 x i16> %r3 , i16 %x4 , i32 4
+ %r5 = insertelement <16 x i16> %r4 , i16 %x5 , i32 5
+ %r6 = insertelement <16 x i16> %r5 , i16 %x6 , i32 6
+ %r7 = insertelement <16 x i16> %r6 , i16 %x7 , i32 7
+ %r8 = insertelement <16 x i16> %r7 , i16 %x8 , i32 8
+ %r9 = insertelement <16 x i16> %r8 , i16 %x9 , i32 9
+ %r10 = insertelement <16 x i16> %r9 , i16 %x10, i32 10
+ %r11 = insertelement <16 x i16> %r10, i16 %x11, i32 11
+ %r12 = insertelement <16 x i16> %r11, i16 %x12, i32 12
+ %r13 = insertelement <16 x i16> %r12, i16 %x13, i32 13
+ %r14 = insertelement <16 x i16> %r13, i16 %x14, i32 14
+ %r15 = insertelement <16 x i16> %r14, i16 %x15, i32 15
+ ret <16 x i16> %r15
+}
+
+define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
+; AVX1-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX1-NEXT: movslq %edi, %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: movslq %esi, %rax
+; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq %edx, %rax
+; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq %ecx, %rax
+; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq %r8d, %rax
+; AVX1-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq %r9d, %rax
+; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX2-NEXT: movslq %edi, %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: movslq %esi, %rax
+; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq %edx, %rax
+; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq %ecx, %rax
+; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq %r8d, %rax
+; AVX2-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq %r9d, %rax
+; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %x0 = extractelement <8 x i16> %x, i32 %i0
+ %x1 = extractelement <8 x i16> %x, i32 %i1
+ %x2 = extractelement <8 x i16> %x, i32 %i2
+ %x3 = extractelement <8 x i16> %x, i32 %i3
+ %x4 = extractelement <8 x i16> %x, i32 %i4
+ %x5 = extractelement <8 x i16> %x, i32 %i5
+ %x6 = extractelement <8 x i16> %x, i32 %i6
+ %x7 = extractelement <8 x i16> %x, i32 %i7
+ %x8 = extractelement <8 x i16> %x, i32 %i8
+ %x9 = extractelement <8 x i16> %x, i32 %i9
+ %x10 = extractelement <8 x i16> %x, i32 %i10
+ %x11 = extractelement <8 x i16> %x, i32 %i11
+ %x12 = extractelement <8 x i16> %x, i32 %i12
+ %x13 = extractelement <8 x i16> %x, i32 %i13
+ %x14 = extractelement <8 x i16> %x, i32 %i14
+ %x15 = extractelement <8 x i16> %x, i32 %i15
+ %r0 = insertelement <16 x i16> undef, i16 %x0 , i32 0
+ %r1 = insertelement <16 x i16> %r0 , i16 %x1 , i32 1
+ %r2 = insertelement <16 x i16> %r1 , i16 %x2 , i32 2
+ %r3 = insertelement <16 x i16> %r2 , i16 %x3 , i32 3
+ %r4 = insertelement <16 x i16> %r3 , i16 %x4 , i32 4
+ %r5 = insertelement <16 x i16> %r4 , i16 %x5 , i32 5
+ %r6 = insertelement <16 x i16> %r5 , i16 %x6 , i32 6
+ %r7 = insertelement <16 x i16> %r6 , i16 %x7 , i32 7
+ %r8 = insertelement <16 x i16> %r7 , i16 %x8 , i32 8
+ %r9 = insertelement <16 x i16> %r8 , i16 %x9 , i32 9
+ %r10 = insertelement <16 x i16> %r9 , i16 %x10, i32 10
+ %r11 = insertelement <16 x i16> %r10, i16 %x11, i32 11
+ %r12 = insertelement <16 x i16> %r11, i16 %x12, i32 12
+ %r13 = insertelement <16 x i16> %r12, i16 %x13, i32 13
+ %r14 = insertelement <16 x i16> %r13, i16 %x14, i32 14
+ %r15 = insertelement <16 x i16> %r14, i16 %x15, i32 15
+ ret <16 x i16> %r15
+}
+
+;
+; Unary shuffle indices from memory
+;
+
+define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwind {
+; AVX1-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: movq (%rdi), %rax
+; AVX1-NEXT: movq 8(%rdi), %rcx
+; AVX1-NEXT: movq 16(%rdi), %rdx
+; AVX1-NEXT: movq 24(%rdi), %rsi
+; AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: movq 8(%rdi), %rcx
+; AVX2-NEXT: movq 16(%rdi), %rdx
+; AVX2-NEXT: movq 24(%rdi), %rsi
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+ %p0 = getelementptr inbounds i64, i64* %i, i32 0
+ %p1 = getelementptr inbounds i64, i64* %i, i32 1
+ %p2 = getelementptr inbounds i64, i64* %i, i32 2
+ %p3 = getelementptr inbounds i64, i64* %i, i32 3
+ %i0 = load i64, i64* %p0, align 4
+ %i1 = load i64, i64* %p1, align 4
+ %i2 = load i64, i64* %p2, align 4
+ %i3 = load i64, i64* %p3, align 4
+ %x0 = extractelement <4 x i64> %x, i64 %i0
+ %x1 = extractelement <4 x i64> %x, i64 %i1
+ %x2 = extractelement <4 x i64> %x, i64 %i2
+ %x3 = extractelement <4 x i64> %x, i64 %i3
+ %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+ %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
+ %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
+ %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
+ ret <4 x i64> %r3
+}
+
+define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwind {
+; AVX1-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: movq (%rdi), %rax
+; AVX1-NEXT: movq 8(%rdi), %rcx
+; AVX1-NEXT: movq 16(%rdi), %rdx
+; AVX1-NEXT: movq 24(%rdi), %rsi
+; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: movq 8(%rdi), %rcx
+; AVX2-NEXT: movq 16(%rdi), %rdx
+; AVX2-NEXT: movq 24(%rdi), %rsi
+; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %p0 = getelementptr inbounds i64, i64* %i, i32 0
+ %p1 = getelementptr inbounds i64, i64* %i, i32 1
+ %p2 = getelementptr inbounds i64, i64* %i, i32 2
+ %p3 = getelementptr inbounds i64, i64* %i, i32 3
+ %i0 = load i64, i64* %p0, align 4
+ %i1 = load i64, i64* %p1, align 4
+ %i2 = load i64, i64* %p2, align 4
+ %i3 = load i64, i64* %p3, align 4
+ %x0 = extractelement <2 x i64> %x, i64 %i0
+ %x1 = extractelement <2 x i64> %x, i64 %i1
+ %x2 = extractelement <2 x i64> %x, i64 %i2
+ %x3 = extractelement <2 x i64> %x, i64 %i3
+ %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+ %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
+ %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
+ %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
+ ret <4 x i64> %r3
+}
diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll
new file mode 100644
index 000000000000..a7794afba3d1
--- /dev/null
+++ b/test/CodeGen/X86/vector-trunc-math.ll
@@ -0,0 +1,5315 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
+
+;
+; add
+;
+
+define <4 x i32> @trunc_add_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_add_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: paddq %xmm2, %xmm0
+; SSE-NEXT: paddq %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = add <4 x i64> %a0, %a1
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_add_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_add_v8i64_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: paddq %xmm6, %xmm2
+; SSE-NEXT: paddq %xmm4, %xmm0
+; SSE-NEXT: paddq %xmm7, %xmm3
+; SSE-NEXT: paddq %xmm5, %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd %ecx, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_v8i64_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_v8i64_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_v8i64_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = add <8 x i64> %a0, %a1
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_add_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_add_v8i32_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_v8i32_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_v8i32_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_v8i32_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = add <8 x i32> %a0, %a1
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_add_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
+; AVX1-NEXT: vpackuswb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
+; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpaddq %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = add <16 x i64> %a0, %a1
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_add_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddd %xmm4, %xmm0
+; SSE-NEXT: paddd %xmm5, %xmm1
+; SSE-NEXT: paddd %xmm6, %xmm2
+; SSE-NEXT: paddd %xmm7, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
+; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = add <16 x i32> %a0, %a1
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE-LABEL: trunc_add_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddw %xmm2, %xmm0
+; SSE-NEXT: paddw %xmm3, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_add_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = add <16 x i16> %a0, %a1
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; add to constant
+;
+
+define <4 x i32> @trunc_add_const_v4i64_4i32(<4 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_add_const_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; SSE-NEXT: paddq %xmm0, %xmm2
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_const_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_const_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_const_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_add_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_add_const_v16i64_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE-NEXT: paddq %xmm4, %xmm0
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm3
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_const_v16i64_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_const_v16i64_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_const_v16i64_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_add_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_add_const_v16i32_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_const_v16i32_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_const_v16i32_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_const_v16i32_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_add_const_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm8
+; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
+; SSE-NEXT: paddq %xmm8, %xmm0
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm3
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm4
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm5
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm6
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm4
+; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
+; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
+; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_add_const_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm2
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
+; SSE-LABEL: trunc_add_const_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
+; SSE-NEXT: paddw {{.*}}(%rip), %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; sub
+;
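+; As in the add tests above, each function below subtracts two vectors and
+; truncates the result. The AVX512 runs collapse to the vpmov* truncating
+; moves (vpmovqd, vpmovqw, vpmovdw, vpmovdb, vpmovwb), while the SSE/AVX1/AVX2
+; lowerings fall back on shuffle, pack and blend sequences, as the checks show.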
+
+define <4 x i32> @trunc_sub_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_sub_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: psubq %xmm2, %xmm0
+; SSE-NEXT: psubq %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = sub <4 x i64> %a0, %a1
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_sub_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_sub_v8i64_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: psubq %xmm6, %xmm2
+; SSE-NEXT: psubq %xmm4, %xmm0
+; SSE-NEXT: psubq %xmm7, %xmm3
+; SSE-NEXT: psubq %xmm5, %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd %ecx, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_v8i64_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_v8i64_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_v8i64_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = sub <8 x i64> %a0, %a1
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_sub_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_sub_v8i32_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: psubd %xmm2, %xmm0
+; SSE-NEXT: psubd %xmm3, %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_v8i32_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_v8i32_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_v8i32_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = sub <8 x i32> %a0, %a1
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_sub_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
+; AVX1-NEXT: vpackuswb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
+; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubq %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = sub <16 x i64> %a0, %a1
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_sub_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: psubd %xmm4, %xmm0
+; SSE-NEXT: psubd %xmm5, %xmm1
+; SSE-NEXT: psubd %xmm6, %xmm2
+; SSE-NEXT: psubd %xmm7, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
+; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = sub <16 x i32> %a0, %a1
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE-LABEL: trunc_sub_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: psubw %xmm2, %xmm0
+; SSE-NEXT: psubw %xmm3, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = sub <16 x i16> %a0, %a1
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; sub to constant
+;
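+; Same as above but with a constant right-hand side, so the subtract folds its
+; operand from memory ({{.*}}(%rip)); in the SSE/AVX1 checks the leading
+; <i64 0, i64 1> piece of the constant is instead built in registers
+; (movl $1, then a movd/vmovq and a pslldq).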
+
+define <4 x i32> @trunc_sub_const_v4i64_4i32(<4 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_sub_const_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; SSE-NEXT: psubq %xmm2, %xmm0
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_const_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_const_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_const_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm4
+; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+; SSE-NEXT: psubq %xmm4, %xmm0
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm2
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm3
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_const_v8i64_v8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: psubd {{.*}}(%rip), %xmm0
+; SSE-NEXT: psubd {{.*}}(%rip), %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm8
+; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
+; SSE-NEXT: psubq %xmm8, %xmm0
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm1
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm2
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm3
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm4
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm5
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm6
+; SSE-NEXT: psubq {{.*}}(%rip), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm4
+; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
+; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
+; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_const_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: psubd {{.*}}(%rip), %xmm0
+; SSE-NEXT: psubd {{.*}}(%rip), %xmm1
+; SSE-NEXT: psubd {{.*}}(%rip), %xmm2
+; SSE-NEXT: psubd {{.*}}(%rip), %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpsubd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
+; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: psubw {{.*}}(%rip), %xmm0
+; SSE-NEXT: psubw {{.*}}(%rip), %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; mul
+;
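+; In the checks below the 64-bit element multiplies are expanded into
+; pmuludq/psrlq/psllq/paddq sequences rather than emitted as a single
+; instruction; the i32 cases use pmulld (or pmuludq plus shuffles on plain
+; SSE) and the i16 cases use pmullw, each followed by the usual truncation
+; shuffles.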
+
+define <4 x i32> @trunc_mul_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_mul_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: pmuludq %xmm2, %xmm4
+; SSE-NEXT: movdqa %xmm2, %xmm5
+; SSE-NEXT: psrlq $32, %xmm5
+; SSE-NEXT: pmuludq %xmm0, %xmm5
+; SSE-NEXT: psllq $32, %xmm5
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: pmuludq %xmm2, %xmm0
+; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm5, %xmm0
+; SSE-NEXT: paddq %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: pmuludq %xmm3, %xmm2
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrlq $32, %xmm4
+; SSE-NEXT: pmuludq %xmm1, %xmm4
+; SSE-NEXT: psllq $32, %xmm4
+; SSE-NEXT: psrlq $32, %xmm1
+; SSE-NEXT: pmuludq %xmm3, %xmm1
+; SSE-NEXT: psllq $32, %xmm1
+; SSE-NEXT: paddq %xmm4, %xmm1
+; SSE-NEXT: paddq %xmm2, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4
+; AVX1-NEXT: vpmuludq %xmm1, %xmm4, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
+; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX512-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
+; AVX512-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX512-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = mul <4 x i64> %a0, %a1
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_mul_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_mul_v8i64_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm2, %xmm8
+; SSE-NEXT: pmuludq %xmm6, %xmm8
+; SSE-NEXT: movdqa %xmm6, %xmm9
+; SSE-NEXT: psrlq $32, %xmm9
+; SSE-NEXT: pmuludq %xmm2, %xmm9
+; SSE-NEXT: psllq $32, %xmm9
+; SSE-NEXT: psrlq $32, %xmm2
+; SSE-NEXT: pmuludq %xmm6, %xmm2
+; SSE-NEXT: psllq $32, %xmm2
+; SSE-NEXT: paddq %xmm9, %xmm2
+; SSE-NEXT: paddq %xmm8, %xmm2
+; SSE-NEXT: movdqa %xmm0, %xmm8
+; SSE-NEXT: pmuludq %xmm4, %xmm8
+; SSE-NEXT: movdqa %xmm4, %xmm6
+; SSE-NEXT: psrlq $32, %xmm6
+; SSE-NEXT: pmuludq %xmm0, %xmm6
+; SSE-NEXT: psllq $32, %xmm6
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: pmuludq %xmm4, %xmm0
+; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm6, %xmm0
+; SSE-NEXT: paddq %xmm8, %xmm0
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: pmuludq %xmm7, %xmm4
+; SSE-NEXT: movdqa %xmm7, %xmm6
+; SSE-NEXT: psrlq $32, %xmm6
+; SSE-NEXT: pmuludq %xmm3, %xmm6
+; SSE-NEXT: psllq $32, %xmm6
+; SSE-NEXT: psrlq $32, %xmm3
+; SSE-NEXT: pmuludq %xmm7, %xmm3
+; SSE-NEXT: psllq $32, %xmm3
+; SSE-NEXT: paddq %xmm6, %xmm3
+; SSE-NEXT: paddq %xmm4, %xmm3
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: pmuludq %xmm5, %xmm4
+; SSE-NEXT: movdqa %xmm5, %xmm6
+; SSE-NEXT: psrlq $32, %xmm6
+; SSE-NEXT: pmuludq %xmm1, %xmm6
+; SSE-NEXT: psllq $32, %xmm6
+; SSE-NEXT: psrlq $32, %xmm1
+; SSE-NEXT: pmuludq %xmm5, %xmm1
+; SSE-NEXT: psllq $32, %xmm1
+; SSE-NEXT: paddq %xmm6, %xmm1
+; SSE-NEXT: paddq %xmm4, %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd %ecx, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_v8i64_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
+; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5
+; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6
+; AVX1-NEXT: vpmuludq %xmm2, %xmm6, %xmm6
+; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
+; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6
+; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6
+; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
+; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5
+; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm6
+; AVX1-NEXT: vpmuludq %xmm3, %xmm6, %xmm6
+; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
+; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6
+; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm6
+; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_v8i64_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm4
+; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5
+; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm5
+; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm1, %ymm5, %ymm1
+; AVX2-NEXT: vpaddq %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4
+; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4
+; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_v8i64_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm3
+; AVX512-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512-NEXT: vpsllq $32, %zmm3, %zmm3
+; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpaddq %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = mul <8 x i64> %a0, %a1
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_mul_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_mul_v8i32_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_v8i32_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_v8i32_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_v8i32_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = mul <8 x i32> %a0, %a1
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_mul_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: movdqa %xmm0, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: psrlq $32, %xmm10
+; SSE-NEXT: pmuludq %xmm0, %xmm10
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: pmuludq %xmm8, %xmm0
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: psllq $32, %xmm10
+; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm10, %xmm0
+; SSE-NEXT: paddq %xmm9, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: psrlq $32, %xmm10
+; SSE-NEXT: pmuludq %xmm1, %xmm10
+; SSE-NEXT: psrlq $32, %xmm1
+; SSE-NEXT: pmuludq %xmm8, %xmm1
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: psllq $32, %xmm10
+; SSE-NEXT: psllq $32, %xmm1
+; SSE-NEXT: paddq %xmm10, %xmm1
+; SSE-NEXT: paddq %xmm9, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: psrlq $32, %xmm10
+; SSE-NEXT: pmuludq %xmm2, %xmm10
+; SSE-NEXT: psrlq $32, %xmm2
+; SSE-NEXT: pmuludq %xmm8, %xmm2
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: psllq $32, %xmm10
+; SSE-NEXT: psllq $32, %xmm2
+; SSE-NEXT: paddq %xmm10, %xmm2
+; SSE-NEXT: paddq %xmm9, %xmm2
+; SSE-NEXT: movdqa %xmm3, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: psrlq $32, %xmm10
+; SSE-NEXT: pmuludq %xmm3, %xmm10
+; SSE-NEXT: psrlq $32, %xmm3
+; SSE-NEXT: pmuludq %xmm8, %xmm3
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: psllq $32, %xmm10
+; SSE-NEXT: psllq $32, %xmm3
+; SSE-NEXT: paddq %xmm10, %xmm3
+; SSE-NEXT: paddq %xmm9, %xmm3
+; SSE-NEXT: movdqa %xmm4, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: psrlq $32, %xmm10
+; SSE-NEXT: pmuludq %xmm4, %xmm10
+; SSE-NEXT: psrlq $32, %xmm4
+; SSE-NEXT: pmuludq %xmm8, %xmm4
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: psllq $32, %xmm10
+; SSE-NEXT: psllq $32, %xmm4
+; SSE-NEXT: paddq %xmm10, %xmm4
+; SSE-NEXT: paddq %xmm9, %xmm4
+; SSE-NEXT: movdqa %xmm5, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: psrlq $32, %xmm10
+; SSE-NEXT: pmuludq %xmm5, %xmm10
+; SSE-NEXT: psrlq $32, %xmm5
+; SSE-NEXT: pmuludq %xmm8, %xmm5
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: psllq $32, %xmm10
+; SSE-NEXT: psllq $32, %xmm5
+; SSE-NEXT: paddq %xmm10, %xmm5
+; SSE-NEXT: paddq %xmm9, %xmm5
+; SSE-NEXT: movdqa %xmm6, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: psrlq $32, %xmm10
+; SSE-NEXT: pmuludq %xmm6, %xmm10
+; SSE-NEXT: psrlq $32, %xmm6
+; SSE-NEXT: pmuludq %xmm8, %xmm6
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: psllq $32, %xmm10
+; SSE-NEXT: psllq $32, %xmm6
+; SSE-NEXT: paddq %xmm10, %xmm6
+; SSE-NEXT: paddq %xmm9, %xmm6
+; SSE-NEXT: movdqa %xmm7, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: movdqa %xmm8, %xmm10
+; SSE-NEXT: psrlq $32, %xmm10
+; SSE-NEXT: pmuludq %xmm7, %xmm10
+; SSE-NEXT: psrlq $32, %xmm7
+; SSE-NEXT: pmuludq %xmm8, %xmm7
+; SSE-NEXT: psllq $32, %xmm10
+; SSE-NEXT: psllq $32, %xmm7
+; SSE-NEXT: paddq %xmm10, %xmm7
+; SSE-NEXT: paddq %xmm9, %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm8
+; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm9
+; AVX1-NEXT: vpmuludq %xmm9, %xmm0, %xmm9
+; AVX1-NEXT: vpsllq $32, %xmm9, %xmm9
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm10
+; AVX1-NEXT: vpmuludq %xmm4, %xmm10, %xmm10
+; AVX1-NEXT: vpsllq $32, %xmm10, %xmm10
+; AVX1-NEXT: vpaddq %xmm10, %xmm9, %xmm9
+; AVX1-NEXT: vpaddq %xmm9, %xmm8, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm10
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm10, %xmm0, %xmm9
+; AVX1-NEXT: vpsrlq $32, %xmm10, %xmm4
+; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm10, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm9, %xmm9
+; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm10
+; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm0
+; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT: vpmuludq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm10, %xmm10
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4
+; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm1
+; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm0
+; AVX1-NEXT: vpsrlq $32, %xmm6, %xmm4
+; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
+; AVX1-NEXT: vpmuludq %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
+; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6
+; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm6
+; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
+; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm7, %xmm4
+; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6
+; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
+; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm6
+; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm7
+; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm7
+; AVX1-NEXT: vpsllq $32, %xmm7, %xmm7
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm3
+; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX1-NEXT: vpaddq %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpaddq %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
+; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm10, %xmm2
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm9, %xmm2
+; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
+; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm8
+; AVX2-NEXT: vpsrlq $32, %ymm5, %ymm9
+; AVX2-NEXT: vpmuludq %ymm9, %ymm1, %ymm9
+; AVX2-NEXT: vpsllq $32, %ymm9, %ymm9
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm1, %ymm9, %ymm1
+; AVX2-NEXT: vpaddq %ymm1, %ymm8, %ymm1
+; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm5
+; AVX2-NEXT: vpsrlq $32, %ymm4, %ymm8
+; AVX2-NEXT: vpmuludq %ymm8, %ymm0, %ymm8
+; AVX2-NEXT: vpsllq $32, %ymm8, %ymm8
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm8, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm4
+; AVX2-NEXT: vpsrlq $32, %ymm7, %ymm5
+; AVX2-NEXT: vpmuludq %ymm5, %ymm3, %ymm5
+; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5
+; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3
+; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm5, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm4
+; AVX2-NEXT: vpsrlq $32, %ymm6, %ymm5
+; AVX2-NEXT: vpmuludq %ymm5, %ymm2, %ymm5
+; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpaddq %ymm2, %ymm4, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
+; AVX512-NEXT: vpsrlq $32, %zmm3, %zmm5
+; AVX512-NEXT: vpmuludq %zmm5, %zmm1, %zmm5
+; AVX512-NEXT: vpsllq $32, %zmm5, %zmm5
+; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm1
+; AVX512-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpsllq $32, %zmm1, %zmm1
+; AVX512-NEXT: vpaddq %zmm1, %zmm5, %zmm1
+; AVX512-NEXT: vpaddq %zmm1, %zmm4, %zmm1
+; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm3
+; AVX512-NEXT: vpsrlq $32, %zmm2, %zmm4
+; AVX512-NEXT: vpmuludq %zmm4, %zmm0, %zmm4
+; AVX512-NEXT: vpsllq $32, %zmm4, %zmm4
+; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; AVX512-NEXT: vpaddq %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = mul <16 x i64> %a0, %a1
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_mul_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm8, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm5, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm6, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm7, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
+; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = mul <16 x i32> %a0, %a1
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE-LABEL: trunc_mul_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pmullw %xmm2, %xmm0
+; SSE-NEXT: pmullw %xmm3, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_mul_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = mul <16 x i16> %a0, %a1
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; mul to constant
+;
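+; Note: the constant cases below exercise the same trunc(mul) folds with a
+; build-vector constant operand. None of these targets has a direct 64-bit
+; element multiply, so i64 lanes are split into 32-bit halves with pmuludq and
+; recombined with shifts and adds; the constants' known-zero upper halves let
+; one partial product be dropped relative to the variable-operand tests above,
+; before the final truncating shuffle/pack sequence.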
+
+define <4 x i32> @trunc_mul_const_v4i64_4i32(<4 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_mul_const_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: pmuludq %xmm2, %xmm3
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: pmuludq %xmm2, %xmm0
+; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm3, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,3]
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: pmuludq %xmm2, %xmm3
+; SSE-NEXT: psrlq $32, %xmm1
+; SSE-NEXT: pmuludq %xmm2, %xmm1
+; SSE-NEXT: psllq $32, %xmm1
+; SSE-NEXT: paddq %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_const_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
+; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3]
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_const_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3]
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_const_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3]
+; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_mul_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_mul_const_v16i64_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5]
+; SSE-NEXT: movdqa %xmm2, %xmm5
+; SSE-NEXT: pmuludq %xmm4, %xmm5
+; SSE-NEXT: psrlq $32, %xmm2
+; SSE-NEXT: pmuludq %xmm4, %xmm2
+; SSE-NEXT: psllq $32, %xmm2
+; SSE-NEXT: paddq %xmm5, %xmm2
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm4
+; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm0, %xmm5
+; SSE-NEXT: pmuludq %xmm4, %xmm5
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: pmuludq %xmm4, %xmm0
+; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm5, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [6,7]
+; SSE-NEXT: movdqa %xmm3, %xmm5
+; SSE-NEXT: pmuludq %xmm4, %xmm5
+; SSE-NEXT: psrlq $32, %xmm3
+; SSE-NEXT: pmuludq %xmm4, %xmm3
+; SSE-NEXT: psllq $32, %xmm3
+; SSE-NEXT: paddq %xmm5, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,3]
+; SSE-NEXT: movdqa %xmm1, %xmm5
+; SSE-NEXT: pmuludq %xmm4, %xmm5
+; SSE-NEXT: psrlq $32, %xmm1
+; SSE-NEXT: pmuludq %xmm4, %xmm1
+; SSE-NEXT: psllq $32, %xmm1
+; SSE-NEXT: paddq %xmm5, %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd %ecx, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_const_v16i64_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4
+; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
+; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7]
+; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_const_v16i64_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,7]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3]
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_const_v16i64_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_mul_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_mul_const_v16i32_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm3, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm2, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm3, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_const_v16i32_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_const_v16i32_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_const_v16i32_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm8
+; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm0, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: pmuludq %xmm8, %xmm0
+; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm9, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [2,3]
+; SSE-NEXT: movdqa %xmm1, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: psrlq $32, %xmm1
+; SSE-NEXT: pmuludq %xmm8, %xmm1
+; SSE-NEXT: psllq $32, %xmm1
+; SSE-NEXT: paddq %xmm9, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [4,5]
+; SSE-NEXT: movdqa %xmm2, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: psrlq $32, %xmm2
+; SSE-NEXT: pmuludq %xmm8, %xmm2
+; SSE-NEXT: psllq $32, %xmm2
+; SSE-NEXT: paddq %xmm9, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [6,7]
+; SSE-NEXT: movdqa %xmm3, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: psrlq $32, %xmm3
+; SSE-NEXT: pmuludq %xmm8, %xmm3
+; SSE-NEXT: psllq $32, %xmm3
+; SSE-NEXT: paddq %xmm9, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [8,9]
+; SSE-NEXT: movdqa %xmm4, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: psrlq $32, %xmm4
+; SSE-NEXT: pmuludq %xmm8, %xmm4
+; SSE-NEXT: psllq $32, %xmm4
+; SSE-NEXT: paddq %xmm9, %xmm4
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [10,11]
+; SSE-NEXT: movdqa %xmm5, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: psrlq $32, %xmm5
+; SSE-NEXT: pmuludq %xmm8, %xmm5
+; SSE-NEXT: psllq $32, %xmm5
+; SSE-NEXT: paddq %xmm9, %xmm5
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [12,13]
+; SSE-NEXT: movdqa %xmm6, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: psrlq $32, %xmm6
+; SSE-NEXT: pmuludq %xmm8, %xmm6
+; SSE-NEXT: psllq $32, %xmm6
+; SSE-NEXT: paddq %xmm9, %xmm6
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [14,15]
+; SSE-NEXT: movdqa %xmm7, %xmm9
+; SSE-NEXT: pmuludq %xmm8, %xmm9
+; SSE-NEXT: psrlq $32, %xmm7
+; SSE-NEXT: pmuludq %xmm8, %xmm7
+; SSE-NEXT: psllq $32, %xmm7
+; SSE-NEXT: paddq %xmm9, %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: movl $1, %eax
+; AVX1-NEXT: vmovq %rax, %xmm4
+; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6
+; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3]
+; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm6
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm6, %xmm9
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5]
+; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm6
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm7
+; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
+; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7]
+; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm7
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm1, %xmm7, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9]
+; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm7
+; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4
+; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpaddq %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11]
+; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm7
+; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [12,13]
+; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm7
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm0
+; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm7, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [14,15]
+; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm7
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm3
+; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX1-NEXT: vpaddq %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm3
+; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm2
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm2
+; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm3
+; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_const_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,6,7]
+; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpaddq %ymm1, %ymm5, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3]
+; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm5
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,14,15]
+; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm5
+; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3
+; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm5, %ymm3
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,10,11]
+; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm5
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_const_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vmovdqa32 {{.*#+}} zmm2 = [8,9,10,11,12,13,14,15]
+; AVX512-NEXT: vpmuludq %zmm2, %zmm1, %zmm3
+; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm1
+; AVX512-NEXT: vpmuludq %zmm2, %zmm1, %zmm1
+; AVX512-NEXT: vpsllq $32, %zmm1, %zmm1
+; AVX512-NEXT: vpaddq %zmm1, %zmm3, %zmm1
+; AVX512-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm3
+; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpaddq %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm5, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm5, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,9,10,11]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm5, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [12,13,14,15]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm4, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm5, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmulld {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
+; SSE-LABEL: trunc_mul_const_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_mul_const_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_mul_const_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; and
+;
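+; Note: truncation commutes with bitwise AND, so these cases check that the
+; AND is simply performed at the original width and that the narrowing reuses
+; the plain trunc lowering (shuffle/pack on SSE/AVX, vpmov* truncates on AVX-512).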
+
+define <4 x i32> @trunc_and_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_and_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = and <4 x i64> %a0, %a1
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_and_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_and_v8i64_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: pand %xmm6, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: pand %xmm7, %xmm3
+; SSE-NEXT: pand %xmm5, %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd %ecx, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_v8i64_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_v8i64_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_v8i64_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = and <8 x i64> %a0, %a1
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_and_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_and_v8i32_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_v8i32_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_v8i32_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_v8i32_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = and <8 x i32> %a0, %a1
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_and_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
+; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = and <16 x i64> %a0, %a1
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_and_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm3, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: pand %xmm2, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm1, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm5, %xmm0
+; SSE-NEXT: packuswb %xmm6, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = and <16 x i32> %a0, %a1
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE-LABEL: trunc_and_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm3, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_and_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = and <16 x i16> %a0, %a1
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; and to constant
+;
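+; Note: as above, but with a constant operand; the mask is folded from the
+; constant pool ({{.*}}(%rip)) where possible, with only the <0,1> i64 pair
+; materialized via movl/movd/pslldq in the SSE code.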
+
+define <4 x i32> @trunc_and_const_v4i64_4i32(<4 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_and_const_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_const_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_const_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_const_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_and_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_and_const_v16i64_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_const_v16i64_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_const_v16i64_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_const_v16i64_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_and_const_v16i32_v16i16(<8 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_and_const_v16i32_v16i16:
+; SSE: # BB#0:
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_const_v16i32_v16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_const_v16i32_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_const_v16i32_v16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_and_const_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm8
+; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
+; SSE-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE-NEXT: pand {{.*}}(%rip), %xmm4
+; SSE-NEXT: pand {{.*}}(%rip), %xmm5
+; SSE-NEXT: pand {{.*}}(%rip), %xmm6
+; SSE-NEXT: pand {{.*}}(%rip), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm9, %xmm7
+; SSE-NEXT: pand %xmm9, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm9, %xmm5
+; SSE-NEXT: pand %xmm9, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm9, %xmm3
+; SSE-NEXT: pand %xmm9, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm9, %xmm1
+; SSE-NEXT: pand %xmm9, %xmm8
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_const_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_and_const_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
+; SSE-LABEL: trunc_and_const_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; xor
+;
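+; The tests below xor two wide vectors and truncate the result; CHECK lines
+; cover the SSE2 shuffle/pack, AVX1/AVX2 cross-lane shuffle and AVX-512
+; vpmov* lowerings of the narrowing step.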
+
+define <4 x i32> @trunc_xor_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_xor_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = xor <4 x i64> %a0, %a1
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_xor_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_xor_v8i64_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm6, %xmm2
+; SSE-NEXT: pxor %xmm4, %xmm0
+; SSE-NEXT: pxor %xmm7, %xmm3
+; SSE-NEXT: pxor %xmm5, %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd %ecx, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_v8i64_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_v8i64_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_v8i64_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = xor <8 x i64> %a0, %a1
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_xor_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_xor_v8i32_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm3, %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_v8i32_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_v8i32_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_v8i32_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = xor <8 x i32> %a0, %a1
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_xor_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2
+; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = xor <16 x i64> %a0, %a1
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_xor_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm4, %xmm0
+; SSE-NEXT: pxor %xmm5, %xmm1
+; SSE-NEXT: pxor %xmm6, %xmm2
+; SSE-NEXT: pxor %xmm7, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = xor <16 x i32> %a0, %a1
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE-LABEL: trunc_xor_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pxor %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm3, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = xor <16 x i16> %a0, %a1
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; xor to constant
+;
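+; The same truncations with a constant vector <0, 1, 2, ...> as the second
+; xor operand.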
+
+define <4 x i32> @trunc_xor_const_v4i64_4i32(<4 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_xor_const_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; SSE-NEXT: pxor %xmm0, %xmm2
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_const_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_const_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_const_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE-NEXT: pxor %xmm4, %xmm0
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm2
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm3
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_const_v8i64_v8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_xor_const_v8i32_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_const_v8i32_v8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm8
+; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
+; SSE-NEXT: pxor %xmm8, %xmm0
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm2
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm3
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm4
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm5
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm6
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_const_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm2
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpxord {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
+; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; or
+;
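+; The same truncation patterns repeated for a variable 'or'
+; (por/vorps/vpor/vporq).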
+
+define <4 x i32> @trunc_or_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_or_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = or <4 x i64> %a0, %a1
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_or_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_or_v8i64_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: por %xmm7, %xmm3
+; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd %ecx, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_v8i64_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_v8i64_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_v8i64_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = or <8 x i64> %a0, %a1
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_or_v8i32_8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_or_v8i32_8i16:
+; SSE: # BB#0:
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_v8i32_8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_v8i32_8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_v8i32_8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = or <8 x i32> %a0, %a1
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE-LABEL: trunc_or_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2
+; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpor %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = or <16 x i64> %a0, %a1
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; SSE-LABEL: trunc_or_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: por %xmm7, %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = or <16 x i32> %a0, %a1
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE-LABEL: trunc_or_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_or_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = or <16 x i16> %a0, %a1
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; or to constant
+;
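+; 'or' against the constant vector <0, 1, 2, ...>, truncated as above.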
+
+define <4 x i32> @trunc_or_const_v4i64_4i32(<4 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_or_const_v4i64_4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: por {{.*}}(%rip), %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_const_v4i64_4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_const_v4i64_4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_const_v4i64_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_or_const_v8i64_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm2
+; SSE-NEXT: por {{.*}}(%rip), %xmm3
+; SSE-NEXT: por {{.*}}(%rip), %xmm1
+; SSE-NEXT: pextrw $4, %xmm1, %eax
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: pextrw $4, %xmm0, %ecx
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pextrw $4, %xmm3, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: movd %eax, %xmm3
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: movd %ecx, %xmm1
+; SSE-NEXT: pextrw $4, %xmm2, %eax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_const_v8i64_v8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+ %2 = trunc <8 x i64> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_or_const_v8i32_v8i16:
+; SSE: # BB#0:
+; SSE-NEXT: por {{.*}}(%rip), %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm1
+; SSE-NEXT: pslld $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = trunc <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_or_const_v16i64_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: movd %rax, %xmm8
+; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
+; SSE-NEXT: por %xmm8, %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm1
+; SSE-NEXT: por {{.*}}(%rip), %xmm2
+; SSE-NEXT: por {{.*}}(%rip), %xmm3
+; SSE-NEXT: por {{.*}}(%rip), %xmm4
+; SSE-NEXT: por {{.*}}(%rip), %xmm5
+; SSE-NEXT: por {{.*}}(%rip), %xmm6
+; SSE-NEXT: por {{.*}}(%rip), %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: packuswb %xmm7, %xmm6
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
+; SSE-NEXT: packuswb %xmm5, %xmm4
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+ %2 = trunc <16 x i64> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
+; SSE-LABEL: trunc_or_const_v16i32_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: por {{.*}}(%rip), %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm1
+; SSE-NEXT: por {{.*}}(%rip), %xmm2
+; SSE-NEXT: por {{.*}}(%rip), %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpord {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %2 = trunc <16 x i32> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
+; SSE-LABEL: trunc_or_const_v16i16_v16i8:
+; SSE: # BB#0:
+; SSE-NEXT: por {{.*}}(%rip), %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
+ %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+ %2 = trunc <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+;
+; Complex patterns - often created by the vectorizer
+;
+
+define <4 x i32> @mul_add_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
+; SSE-LABEL: mul_add_v4i64_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: psrad $31, %xmm3
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: psrad $31, %xmm3
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: pmuludq %xmm1, %xmm4
+; SSE-NEXT: movdqa %xmm1, %xmm5
+; SSE-NEXT: psrlq $32, %xmm5
+; SSE-NEXT: pmuludq %xmm0, %xmm5
+; SSE-NEXT: psllq $32, %xmm5
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: pmuludq %xmm1, %xmm0
+; SSE-NEXT: psllq $32, %xmm0
+; SSE-NEXT: paddq %xmm5, %xmm0
+; SSE-NEXT: paddq %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: pmuludq %xmm3, %xmm1
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrlq $32, %xmm4
+; SSE-NEXT: pmuludq %xmm2, %xmm4
+; SSE-NEXT: psllq $32, %xmm4
+; SSE-NEXT: psrlq $32, %xmm2
+; SSE-NEXT: pmuludq %xmm3, %xmm2
+; SSE-NEXT: psllq $32, %xmm2
+; SSE-NEXT: paddq %xmm4, %xmm2
+; SSE-NEXT: paddq %xmm1, %xmm2
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
+; SSE-NEXT: paddq {{.*}}(%rip), %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: mul_add_v4i64_v4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
+; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
+; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5
+; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm1
+; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
+; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
+; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: mul_add_v4i64_v4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
+; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_add_v4i64_v4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX512-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
+; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm3
+; AVX512-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
+; AVX512-NEXT: vpsllq $32, %ymm3, %ymm3
+; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
+; AVX512-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: retq
+ %1 = sext <4 x i32> %a0 to <4 x i64>
+ %2 = sext <4 x i32> %a1 to <4 x i64>
+ %3 = mul <4 x i64> %1, %2
+ %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
+ %5 = trunc <4 x i64> %4 to <4 x i32>
+ ret <4 x i32> %5
+}
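
The pmuludq/psllq sequences above synthesize the 64-bit element multiply from three 32x32->64 partial products, since SSE and AVX2 lack a packed 64-bit multiply. A minimal scalar sketch of that identity in LLVM IR (illustrative only; the name and layout are not taken from the test):

; Sketch: a*b mod 2^64 from 32-bit halves, mirroring the pmuludq/psllq sequence.
define i64 @mul64_from_32bit_halves(i64 %a, i64 %b) {
  %alo = and i64 %a, 4294967295        ; low 32 bits of a
  %ahi = lshr i64 %a, 32               ; high 32 bits of a
  %blo = and i64 %b, 4294967295
  %bhi = lshr i64 %b, 32
  %lo  = mul i64 %alo, %blo            ; pmuludq: lo(a)*lo(b)
  %m1  = mul i64 %alo, %bhi            ; pmuludq: lo(a)*hi(b)
  %m2  = mul i64 %ahi, %blo            ; pmuludq: hi(a)*lo(b)
  %mid = add i64 %m1, %m2
  %sh  = shl i64 %mid, 32              ; psllq $32: cross terms land in the high half
  %res = add i64 %lo, %sh              ; hi(a)*hi(b) only affects bits >= 64, so it is dropped
  ret i64 %res
}
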
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index 8c02c5a5433f..cfeb41e891d6 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -52,9 +52,10 @@ define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
;
; AVX2-LABEL: trunc8i64_8i32:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
@@ -135,12 +136,14 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
;
; AVX2-LABEL: trunc8i64_8i16:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -187,9 +190,10 @@ define void @trunc8i64_8i8(<8 x i64> %a) {
;
; AVX2-LABEL: trunc8i64_8i8:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
@@ -248,12 +252,15 @@ define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i16:
; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512BW-NEXT: retq
entry:
%0 = trunc <8 x i32> %a to <8 x i16>
@@ -311,6 +318,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
;
; AVX512BW-LABEL: trunc8i32_8i8:
; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
@@ -422,14 +430,17 @@ define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: trunc2x4i64_8i32:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: trunc2x4i64_8i32:
; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -517,9 +528,10 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: trunc2x4i64_8i16:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
@@ -529,6 +541,8 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512BW-LABEL: trunc2x4i64_8i16:
; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll
index f1714d4845de..c9ad6e40d1c2 100644
--- a/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -5,6 +5,11 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD
+;
+; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41
define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
@@ -82,6 +87,32 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: testv2i64:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pushl %esi
+; X32-SSE-NEXT: pextrd $3, %xmm0, %eax
+; X32-SSE-NEXT: bsfl %eax, %eax
+; X32-SSE-NEXT: movl $32, %ecx
+; X32-SSE-NEXT: cmovel %ecx, %eax
+; X32-SSE-NEXT: addl $32, %eax
+; X32-SSE-NEXT: pextrd $2, %xmm0, %edx
+; X32-SSE-NEXT: bsfl %edx, %esi
+; X32-SSE-NEXT: testl %edx, %edx
+; X32-SSE-NEXT: cmovel %eax, %esi
+; X32-SSE-NEXT: movd %esi, %xmm1
+; X32-SSE-NEXT: pextrd $1, %xmm0, %eax
+; X32-SSE-NEXT: bsfl %eax, %eax
+; X32-SSE-NEXT: cmovel %ecx, %eax
+; X32-SSE-NEXT: addl $32, %eax
+; X32-SSE-NEXT: movd %xmm0, %ecx
+; X32-SSE-NEXT: bsfl %ecx, %edx
+; X32-SSE-NEXT: testl %ecx, %ecx
+; X32-SSE-NEXT: cmovel %eax, %edx
+; X32-SSE-NEXT: movd %edx, %xmm0
+; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT: popl %esi
+; X32-SSE-NEXT: retl
%out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 0)
ret <2 x i64> %out
}
@@ -137,16 +168,68 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
-; AVX-LABEL: testv2i64u:
-; AVX: # BB#0:
-; AVX-NEXT: vpextrq $1, %xmm0, %rax
-; AVX-NEXT: bsfq %rax, %rax
-; AVX-NEXT: vmovq %rax, %xmm1
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: bsfq %rax, %rax
-; AVX-NEXT: vmovq %rax, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: retq
+; AVX1-LABEL: testv2i64u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: bsfq %rax, %rax
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: bsfq %rax, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv2i64u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: bsfq %rax, %rax
+; AVX2-NEXT: vmovq %rax, %xmm1
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: bsfq %rax, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv2i64u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vplzcntq %xmm0, %xmm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [63,63]
+; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv2i64u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm1
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63]
+; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv2i64u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pextrd $2, %xmm0, %eax
+; X32-SSE-NEXT: bsfl %eax, %ecx
+; X32-SSE-NEXT: pextrd $3, %xmm0, %edx
+; X32-SSE-NEXT: bsfl %edx, %edx
+; X32-SSE-NEXT: addl $32, %edx
+; X32-SSE-NEXT: testl %eax, %eax
+; X32-SSE-NEXT: cmovnel %ecx, %edx
+; X32-SSE-NEXT: movd %edx, %xmm1
+; X32-SSE-NEXT: movd %xmm0, %eax
+; X32-SSE-NEXT: bsfl %eax, %ecx
+; X32-SSE-NEXT: pextrd $1, %xmm0, %edx
+; X32-SSE-NEXT: bsfl %edx, %edx
+; X32-SSE-NEXT: addl $32, %edx
+; X32-SSE-NEXT: testl %eax, %eax
+; X32-SSE-NEXT: cmovnel %ecx, %edx
+; X32-SSE-NEXT: movd %edx, %xmm0
+; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT: retl
%out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 -1)
ret <2 x i64> %out
}
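
AVX512CD provides a vector leading-zero count but no vector trailing-zero count, so when a zero input would be undefined (the i1 -1 flag) the lowering above isolates the lowest set bit and converts a vplzcnt result. A scalar sketch of the same identity, assuming a non-zero input:

; Sketch: cttz(x) = 63 - ctlz(x & -x) for x != 0 (x & -x keeps only the lowest set bit).
define i64 @cttz_via_ctlz(i64 %x) {
  %neg  = sub i64 0, %x
  %blsi = and i64 %x, %neg                              ; lowest set bit of x
  %lz   = call i64 @llvm.ctlz.i64(i64 %blsi, i1 true)   ; vplzcntq equivalent
  %tz   = sub i64 63, %lz
  ret i64 %tz
}
declare i64 @llvm.ctlz.i64(i64, i1)
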
@@ -302,6 +385,74 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv4i32:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm2
+; AVX512CDVL-NEXT: vpandd %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpsubd {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %xmm2, %xmm0, %xmm3
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpandq %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; AVX512CDVL-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512CDVL-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; AVX512CDVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512CDVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv4i32:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm2
+; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX512CD-NEXT: vpsubd %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm3
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; AVX512CD-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX512CD-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512CD-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; AVX512CD-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512CD-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv4i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: psubd %xmm0, %xmm2
+; X32-SSE-NEXT: pand %xmm0, %xmm2
+; X32-SSE-NEXT: psubd {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm2, %xmm4
+; X32-SSE-NEXT: pand %xmm3, %xmm4
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm5
+; X32-SSE-NEXT: pshufb %xmm4, %xmm5
+; X32-SSE-NEXT: psrlw $4, %xmm2
+; X32-SSE-NEXT: pand %xmm3, %xmm2
+; X32-SSE-NEXT: pshufb %xmm2, %xmm0
+; X32-SSE-NEXT: paddb %xmm5, %xmm0
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X32-SSE-NEXT: psadbw %xmm1, %xmm2
+; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: psadbw %xmm1, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
+; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0)
ret <4 x i32> %out
}
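
When a zero input must produce the full bit width (the i1 0 flag), the expansion above falls back to a population count: (x & -x) - 1 has exactly cttz(x) set bits, the [0,1,1,2,1,2,2,3,...] constant is a per-nibble popcount table indexed via pshufb, and psadbw sums the per-byte counts. A scalar sketch of the identity:

; Sketch: cttz(x) = popcount((x & -x) - 1); for x == 0 this yields popcount(-1) = 32.
define i32 @cttz_via_popcount(i32 %x) {
  %neg  = sub i32 0, %x
  %blsi = and i32 %x, %neg                      ; lowest set bit (0 if x == 0)
  %mask = sub i32 %blsi, 1                      ; cttz(x) ones in the low bits
  %tz   = call i32 @llvm.ctpop.i32(i32 %mask)   ; counted with the pshufb nibble table
  ret i32 %tz
}
declare i32 @llvm.ctpop.i32(i32)
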
@@ -457,6 +608,51 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv4i32u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpandd %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vplzcntd %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv4i32u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm1
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv4i32u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: pxor %xmm2, %xmm2
+; X32-SSE-NEXT: psubd %xmm0, %xmm2
+; X32-SSE-NEXT: pand %xmm0, %xmm2
+; X32-SSE-NEXT: psubd {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm2, %xmm4
+; X32-SSE-NEXT: pand %xmm3, %xmm4
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm5
+; X32-SSE-NEXT: pshufb %xmm4, %xmm5
+; X32-SSE-NEXT: psrlw $4, %xmm2
+; X32-SSE-NEXT: pand %xmm3, %xmm2
+; X32-SSE-NEXT: pshufb %xmm2, %xmm0
+; X32-SSE-NEXT: paddb %xmm5, %xmm0
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X32-SSE-NEXT: psadbw %xmm1, %xmm2
+; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT: psadbw %xmm1, %xmm0
+; X32-SSE-NEXT: packuswb %xmm2, %xmm0
+; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1)
ret <4 x i32> %out
}
@@ -558,24 +754,103 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: testv8i16:
-; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
-; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: testv8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv8i16:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm2
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv8i16:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv8i16:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: psubw %xmm0, %xmm1
+; X32-SSE-NEXT: pand %xmm0, %xmm1
+; X32-SSE-NEXT: psubw {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: pand %xmm0, %xmm2
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pshufb %xmm2, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm1
+; X32-SSE-NEXT: pand %xmm0, %xmm1
+; X32-SSE-NEXT: pshufb %xmm1, %xmm3
+; X32-SSE-NEXT: paddb %xmm4, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm0
+; X32-SSE-NEXT: psllw $8, %xmm0
+; X32-SSE-NEXT: paddb %xmm3, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 0)
ret <8 x i16> %out
}
@@ -677,24 +952,103 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: testv8i16u:
-; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
-; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: testv8i16u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv8i16u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv8i16u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm2
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv8i16u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1
+; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv8i16u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: psubw %xmm0, %xmm1
+; X32-SSE-NEXT: pand %xmm0, %xmm1
+; X32-SSE-NEXT: psubw {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm2
+; X32-SSE-NEXT: pand %xmm0, %xmm2
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT: movdqa %xmm3, %xmm4
+; X32-SSE-NEXT: pshufb %xmm2, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm1
+; X32-SSE-NEXT: pand %xmm0, %xmm1
+; X32-SSE-NEXT: pshufb %xmm1, %xmm3
+; X32-SSE-NEXT: paddb %xmm4, %xmm3
+; X32-SSE-NEXT: movdqa %xmm3, %xmm0
+; X32-SSE-NEXT: psllw $8, %xmm0
+; X32-SSE-NEXT: paddb %xmm3, %xmm0
+; X32-SSE-NEXT: psrlw $8, %xmm0
+; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 -1)
ret <8 x i16> %out
}
@@ -780,21 +1134,87 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE41-NEXT: paddb %xmm4, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: testv16i8:
-; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: testv16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv16i8:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm2
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv16i8:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv16i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: psubb %xmm0, %xmm1
+; X32-SSE-NEXT: pand %xmm0, %xmm1
+; X32-SSE-NEXT: psubb {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm3
+; X32-SSE-NEXT: pand %xmm2, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm4
+; X32-SSE-NEXT: pshufb %xmm3, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm1
+; X32-SSE-NEXT: pand %xmm2, %xmm1
+; X32-SSE-NEXT: pshufb %xmm1, %xmm0
+; X32-SSE-NEXT: paddb %xmm4, %xmm0
+; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 0)
ret <16 x i8> %out
}
@@ -880,21 +1300,87 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; SSE41-NEXT: paddb %xmm4, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: testv16i8u:
-; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: testv16i8u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: testv16i8u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv16i8u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm2
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv16i8u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: testv16i8u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pxor %xmm1, %xmm1
+; X32-SSE-NEXT: psubb %xmm0, %xmm1
+; X32-SSE-NEXT: pand %xmm0, %xmm1
+; X32-SSE-NEXT: psubb {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT: movdqa %xmm1, %xmm3
+; X32-SSE-NEXT: pand %xmm2, %xmm3
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm4
+; X32-SSE-NEXT: pshufb %xmm3, %xmm4
+; X32-SSE-NEXT: psrlw $4, %xmm1
+; X32-SSE-NEXT: pand %xmm2, %xmm1
+; X32-SSE-NEXT: pshufb %xmm1, %xmm0
+; X32-SSE-NEXT: paddb %xmm4, %xmm0
+; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 -1)
ret <16 x i8> %out
}
@@ -911,6 +1397,12 @@ define <2 x i64> @foldv2i64() nounwind {
; AVX-NEXT: movl $8, %eax
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: foldv2i64:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl $8, %eax
+; X32-SSE-NEXT: movd %eax, %xmm0
+; X32-SSE-NEXT: retl
%out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0)
ret <2 x i64> %out
}
@@ -927,6 +1419,12 @@ define <2 x i64> @foldv2i64u() nounwind {
; AVX-NEXT: movl $8, %eax
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: retq
+;
+; X32-SSE-LABEL: foldv2i64u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl $8, %eax
+; X32-SSE-NEXT: movd %eax, %xmm0
+; X32-SSE-NEXT: retl
%out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1)
ret <2 x i64> %out
}
@@ -937,10 +1435,30 @@ define <4 x i32> @foldv4i32() nounwind {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv4i32:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
-; AVX-NEXT: retq
+; AVX1-LABEL: foldv4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv4i32:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa32 {{.*#+}} xmm0 = [8,0,32,0]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv4i32:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv4i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
+; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 0)
ret <4 x i32> %out
}
@@ -951,10 +1469,30 @@ define <4 x i32> @foldv4i32u() nounwind {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv4i32u:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
-; AVX-NEXT: retq
+; AVX1-LABEL: foldv4i32u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv4i32u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv4i32u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa32 {{.*#+}} xmm0 = [8,0,32,0]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv4i32u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv4i32u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
+; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 -1)
ret <4 x i32> %out
}
@@ -965,10 +1503,30 @@ define <8 x i16> @foldv8i16() nounwind {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv8i16:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: foldv8i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv8i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv8i16:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv8i16:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv8i16:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 0)
ret <8 x i16> %out
}
@@ -979,10 +1537,30 @@ define <8 x i16> @foldv8i16u() nounwind {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv8i16u:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
-; AVX-NEXT: retq
+; AVX1-LABEL: foldv8i16u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv8i16u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv8i16u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv8i16u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv8i16u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 -1)
ret <8 x i16> %out
}
@@ -993,10 +1571,30 @@ define <16 x i8> @foldv16i8() nounwind {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv16i8:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
-; AVX-NEXT: retq
+; AVX1-LABEL: foldv16i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv16i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv16i8:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv16i8:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv16i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 0)
ret <16 x i8> %out
}
@@ -1007,10 +1605,30 @@ define <16 x i8> @foldv16i8u() nounwind {
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; SSE-NEXT: retq
;
-; AVX-LABEL: foldv16i8u:
-; AVX: # BB#0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
-; AVX-NEXT: retq
+; AVX1-LABEL: foldv16i8u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv16i8u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv16i8u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv16i8u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512CD-NEXT: retq
+;
+; X32-SSE-LABEL: foldv16i8u:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 -1)
ret <16 x i8> %out
}
diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll
index a9ee27cc51f0..286bc50ec723 100644
--- a/test/CodeGen/X86/vector-tzcnt-256.ll
+++ b/test/CodeGen/X86/vector-tzcnt-256.ll
@@ -1,6 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD
define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
@@ -51,6 +53,41 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv4i64:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm2
+; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpsubq {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm3
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512CDVL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv4i64:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm2
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
+; AVX512CD-NEXT: vpsubq %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: retq
%out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0)
ret <4 x i64> %out
}
@@ -104,6 +141,26 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv4i64u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vplzcntq %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
+; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv4i64u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
+; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0
+; AVX512CD-NEXT: retq
%out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1)
ret <4 x i64> %out
}
@@ -169,6 +226,49 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv8i32:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm2
+; AVX512CDVL-NEXT: vpandd %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpsubd {{.*}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm3
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512CDVL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
+; AVX512CDVL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv8i32:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm2
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX512CD-NEXT: vpsubd %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX512CD-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
+; AVX512CD-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: retq
%out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 0)
ret <8 x i32> %out
}
@@ -234,6 +334,26 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv8i32u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpandd %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vplzcntd %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv8i32u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0
+; AVX512CD-NEXT: retq
%out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 -1)
ret <8 x i32> %out
}
@@ -292,6 +412,44 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv16i16:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm2
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpsllw $8, %ymm0, %ymm1
+; AVX512CDVL-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512CDVL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv16i16:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubw %ymm0, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512CD-NEXT: retq
%out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 0)
ret <16 x i16> %out
}
@@ -350,6 +508,44 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv16i16u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm2
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpsllw $8, %ymm0, %ymm1
+; AVX512CDVL-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512CDVL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv16i16u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubw %ymm0, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512CD-NEXT: retq
%out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 -1)
ret <16 x i16> %out
}
@@ -399,6 +595,38 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv32i8:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm2
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv32i8:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubb %ymm0, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: retq
%out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 0)
ret <32 x i8> %out
}
@@ -448,78 +676,230 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: testv32i8u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm2
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: testv32i8u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubb %ymm0, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: retq
%out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 -1)
ret <32 x i8> %out
}
define <4 x i64> @foldv4i64() nounwind {
-; ALL-LABEL: foldv4i64:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
-; ALL-NEXT: retq
+; AVX1-LABEL: foldv4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv4i64:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,64,0]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv4i64:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
+; AVX512CD-NEXT: retq
%out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
ret <4 x i64> %out
}
define <4 x i64> @foldv4i64u() nounwind {
-; ALL-LABEL: foldv4i64u:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
-; ALL-NEXT: retq
+; AVX1-LABEL: foldv4i64u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv4i64u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv4i64u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,64,0]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv4i64u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
+; AVX512CD-NEXT: retq
%out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
ret <4 x i64> %out
}
define <8 x i32> @foldv8i32() nounwind {
-; ALL-LABEL: foldv8i32:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: foldv8i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv8i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv8i32:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa32 {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv8i32:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; AVX512CD-NEXT: retq
%out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0)
ret <8 x i32> %out
}
define <8 x i32> @foldv8i32u() nounwind {
-; ALL-LABEL: foldv8i32u:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: foldv8i32u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv8i32u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv8i32u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa32 {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv8i32u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; AVX512CD-NEXT: retq
%out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1)
ret <8 x i32> %out
}
define <16 x i16> @foldv16i16() nounwind {
-; ALL-LABEL: foldv16i16:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; ALL-NEXT: retq
+; AVX1-LABEL: foldv16i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv16i16:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv16i16:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512CD-NEXT: retq
%out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0)
ret <16 x i16> %out
}
define <16 x i16> @foldv16i16u() nounwind {
-; ALL-LABEL: foldv16i16u:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; ALL-NEXT: retq
+; AVX1-LABEL: foldv16i16u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv16i16u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv16i16u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv16i16u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512CD-NEXT: retq
%out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1)
ret <16 x i16> %out
}
define <32 x i8> @foldv32i8() nounwind {
-; ALL-LABEL: foldv32i8:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; ALL-NEXT: retq
+; AVX1-LABEL: foldv32i8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv32i8:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv32i8:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; AVX512CD-NEXT: retq
%out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0)
ret <32 x i8> %out
}
define <32 x i8> @foldv32i8u() nounwind {
-; ALL-LABEL: foldv32i8u:
-; ALL: # BB#0:
-; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; ALL-NEXT: retq
+; AVX1-LABEL: foldv32i8u:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: foldv32i8u:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; AVX2-NEXT: retq
+;
+; AVX512CDVL-LABEL: foldv32i8u:
+; AVX512CDVL: # BB#0:
+; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; AVX512CDVL-NEXT: retq
+;
+; AVX512CD-LABEL: foldv32i8u:
+; AVX512CD: # BB#0:
+; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; AVX512CD-NEXT: retq
%out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1)
ret <32 x i8> %out
}
diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll
index 9265fad0176c..81bfd8189b8f 100644
--- a/test/CodeGen/X86/vector-tzcnt-512.ll
+++ b/test/CodeGen/X86/vector-tzcnt-512.ll
@@ -1,266 +1,509 @@
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,-avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CDBW
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=-avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
-; ALL-LABEL: testv8i64:
-; ALL: ## BB#0:
-; ALL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; ALL-NEXT: vpextrq $1, %xmm1, %rax
-; ALL-NEXT: tzcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm2
-; ALL-NEXT: vmovq %xmm1, %rax
-; ALL-NEXT: tzcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm1
-; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; ALL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; ALL-NEXT: vpextrq $1, %xmm2, %rax
-; ALL-NEXT: tzcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm3
-; ALL-NEXT: vmovq %xmm2, %rax
-; ALL-NEXT: tzcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm2
-; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ALL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ALL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; ALL-NEXT: vpextrq $1, %xmm2, %rax
-; ALL-NEXT: tzcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm3
-; ALL-NEXT: vmovq %xmm2, %rax
-; ALL-NEXT: tzcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm2
-; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; ALL-NEXT: vpextrq $1, %xmm0, %rax
-; ALL-NEXT: tzcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm3
-; ALL-NEXT: vmovq %xmm0, %rax
-; ALL-NEXT: tzcntq %rax, %rax
-; ALL-NEXT: vmovq %rax, %xmm0
-; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; ALL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv8i64:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1
+; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512CD-NEXT: vpsadbw %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm5
+; AVX512CD-NEXT: vpshufb %ymm5, %ymm4, %ymm5
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512CD-NEXT: retq
+;
+; AVX512CDBW-LABEL: testv8i64:
+; AVX512CDBW: ## BB#0:
+; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm2
+; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm3
+; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
+; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm4, %zmm0
+; AVX512CDBW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: retq
+;
+; AVX512BW-LABEL: testv8i64:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubq %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm4, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0)
ret <8 x i64> %out
}
define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
-; ALL-LABEL: testv8i64u:
-; ALL: ## BB#0:
-; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; ALL-NEXT: vpsubq %zmm0, %zmm1, %zmm1
-; ALL-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; ALL-NEXT: vplzcntq %zmm0, %zmm0
-; ALL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpsubq %zmm0, %zmm1, %zmm0
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv8i64u:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1
+; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1
+; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm0
+; AVX512CD-NEXT: retq
+;
+; AVX512CDBW-LABEL: testv8i64u:
+; AVX512CDBW: ## BB#0:
+; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1
+; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm0
+; AVX512CDBW-NEXT: retq
+;
+; AVX512BW-LABEL: testv8i64u:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubq %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm4, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 -1)
ret <8 x i64> %out
}
define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
-; ALL-LABEL: testv16i32:
-; ALL: ## BB#0:
-; ALL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; ALL-NEXT: vpextrd $1, %xmm1, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vmovd %xmm1, %ecx
-; ALL-NEXT: tzcntl %ecx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm2
-; ALL-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; ALL-NEXT: vpextrd $2, %xmm1, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; ALL-NEXT: vpextrd $3, %xmm1, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
-; ALL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; ALL-NEXT: vpextrd $1, %xmm2, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vmovd %xmm2, %ecx
-; ALL-NEXT: tzcntl %ecx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm3
-; ALL-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $2, %xmm2, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $3, %xmm2, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
-; ALL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ALL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; ALL-NEXT: vpextrd $1, %xmm2, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vmovd %xmm2, %ecx
-; ALL-NEXT: tzcntl %ecx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm3
-; ALL-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $2, %xmm2, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $3, %xmm2, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
-; ALL-NEXT: vpextrd $1, %xmm0, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vmovd %xmm0, %ecx
-; ALL-NEXT: tzcntl %ecx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm3
-; ALL-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $2, %xmm0, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; ALL-NEXT: vpextrd $3, %xmm0, %eax
-; ALL-NEXT: tzcntl %eax, %eax
-; ALL-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
-; ALL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv16i32:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1
+; AVX512CD-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512CD-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
+; AVX512CD-NEXT: vpsadbw %ymm3, %ymm5, %ymm5
+; AVX512CD-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
+; AVX512CD-NEXT: vpsadbw %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm5
+; AVX512CD-NEXT: vpshufb %ymm5, %ymm4, %ymm5
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0
+; AVX512CD-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7]
+; AVX512CD-NEXT: vpsadbw %ymm3, %ymm2, %ymm2
+; AVX512CD-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5]
+; AVX512CD-NEXT: vpsadbw %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512CD-NEXT: retq
+;
+; AVX512CDBW-LABEL: testv16i32:
+; AVX512CDBW: ## BB#0:
+; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
+; AVX512CDBW-NEXT: vpandd %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm3
+; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
+; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm4, %zmm0
+; AVX512CDBW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; AVX512CDBW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
+; AVX512CDBW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; AVX512CDBW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: retq
+;
+; AVX512BW-LABEL: testv16i32:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm4, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 0)
ret <16 x i32> %out
}
define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
-; ALL-LABEL: testv16i32u:
-; ALL: ## BB#0:
-; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; ALL-NEXT: vpsubd %zmm0, %zmm1, %zmm1
-; ALL-NEXT: vpandd %zmm1, %zmm0, %zmm0
-; ALL-NEXT: vplzcntd %zmm0, %zmm0
-; ALL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpsubd %zmm0, %zmm1, %zmm0
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv16i32u:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1
+; AVX512CD-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1
+; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm0
+; AVX512CD-NEXT: retq
+;
+; AVX512CDBW-LABEL: testv16i32u:
+; AVX512CDBW: ## BB#0:
+; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1
+; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm0
+; AVX512CDBW-NEXT: retq
+;
+; AVX512BW-LABEL: testv16i32u:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm4, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 -1)
ret <16 x i32> %out
}
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
-; ALL-LABEL: testv32i16:
-; ALL: ## BB#0:
-; ALL-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; ALL-NEXT: vpsubw %ymm0, %ymm2, %ymm3
-; ALL-NEXT: vpand %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; ALL-NEXT: vpsubw %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; ALL-NEXT: vpand %ymm4, %ymm0, %ymm5
-; ALL-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; ALL-NEXT: vpshufb %ymm5, %ymm6, %ymm5
-; ALL-NEXT: vpsrlw $4, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm4, %ymm0, %ymm0
-; ALL-NEXT: vpshufb %ymm0, %ymm6, %ymm0
-; ALL-NEXT: vpaddb %ymm5, %ymm0, %ymm0
-; ALL-NEXT: vpsllw $8, %ymm0, %ymm5
-; ALL-NEXT: vpaddb %ymm0, %ymm5, %ymm0
-; ALL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; ALL-NEXT: vpsubw %ymm1, %ymm2, %ymm2
-; ALL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; ALL-NEXT: vpsubw %ymm3, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm4, %ymm1, %ymm2
-; ALL-NEXT: vpshufb %ymm2, %ymm6, %ymm2
-; ALL-NEXT: vpsrlw $4, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm4, %ymm1, %ymm1
-; ALL-NEXT: vpshufb %ymm1, %ymm6, %ymm1
-; ALL-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; ALL-NEXT: vpsllw $8, %ymm1, %ymm2
-; ALL-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; ALL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv32i16:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512CD-NEXT: vpsubw %ymm0, %ymm2, %ymm3
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512CD-NEXT: vpsubw %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm5
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm5, %ymm6, %ymm5
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm5
+; AVX512CD-NEXT: vpaddb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsubw %ymm1, %ymm2, %ymm2
+; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubw %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm2
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm6, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsllw $8, %ymm1, %ymm2
+; AVX512CD-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512CD-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512CD-NEXT: retq
+;
+; AVX512CDBW-LABEL: testv32i16:
+; AVX512CDBW: ## BB#0:
+; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpsubw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512CDBW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpsllw $8, %zmm0, %zmm1
+; AVX512CDBW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512CDBW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512CDBW-NEXT: retq
+;
+; AVX512BW-LABEL: testv32i16:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 0)
ret <32 x i16> %out
}
define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
-; ALL-LABEL: testv32i16u:
-; ALL: ## BB#0:
-; ALL-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; ALL-NEXT: vpsubw %ymm0, %ymm2, %ymm3
-; ALL-NEXT: vpand %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; ALL-NEXT: vpsubw %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; ALL-NEXT: vpand %ymm4, %ymm0, %ymm5
-; ALL-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; ALL-NEXT: vpshufb %ymm5, %ymm6, %ymm5
-; ALL-NEXT: vpsrlw $4, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm4, %ymm0, %ymm0
-; ALL-NEXT: vpshufb %ymm0, %ymm6, %ymm0
-; ALL-NEXT: vpaddb %ymm5, %ymm0, %ymm0
-; ALL-NEXT: vpsllw $8, %ymm0, %ymm5
-; ALL-NEXT: vpaddb %ymm0, %ymm5, %ymm0
-; ALL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; ALL-NEXT: vpsubw %ymm1, %ymm2, %ymm2
-; ALL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; ALL-NEXT: vpsubw %ymm3, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm4, %ymm1, %ymm2
-; ALL-NEXT: vpshufb %ymm2, %ymm6, %ymm2
-; ALL-NEXT: vpsrlw $4, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm4, %ymm1, %ymm1
-; ALL-NEXT: vpshufb %ymm1, %ymm6, %ymm1
-; ALL-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; ALL-NEXT: vpsllw $8, %ymm1, %ymm2
-; ALL-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; ALL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv32i16u:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512CD-NEXT: vpsubw %ymm0, %ymm2, %ymm3
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512CD-NEXT: vpsubw %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm5
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm5, %ymm6, %ymm5
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm5
+; AVX512CD-NEXT: vpaddb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsubw %ymm1, %ymm2, %ymm2
+; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubw %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm2
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm6, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsllw $8, %ymm1, %ymm2
+; AVX512CD-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512CD-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512CD-NEXT: retq
+;
+; AVX512CDBW-LABEL: testv32i16u:
+; AVX512CDBW: ## BB#0:
+; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpsubw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512CDBW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpsllw $8, %zmm0, %zmm1
+; AVX512CDBW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512CDBW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512CDBW-NEXT: retq
+;
+; AVX512BW-LABEL: testv32i16u:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 -1)
ret <32 x i16> %out
}
define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
-; ALL-LABEL: testv64i8:
-; ALL: ## BB#0:
-; ALL-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; ALL-NEXT: vpsubb %ymm0, %ymm2, %ymm3
-; ALL-NEXT: vpand %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; ALL-NEXT: vpsubb %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; ALL-NEXT: vpand %ymm4, %ymm0, %ymm5
-; ALL-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; ALL-NEXT: vpshufb %ymm5, %ymm6, %ymm5
-; ALL-NEXT: vpsrlw $4, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm4, %ymm0, %ymm0
-; ALL-NEXT: vpshufb %ymm0, %ymm6, %ymm0
-; ALL-NEXT: vpaddb %ymm5, %ymm0, %ymm0
-; ALL-NEXT: vpsubb %ymm1, %ymm2, %ymm2
-; ALL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; ALL-NEXT: vpsubb %ymm3, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm4, %ymm1, %ymm2
-; ALL-NEXT: vpshufb %ymm2, %ymm6, %ymm2
-; ALL-NEXT: vpsrlw $4, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm4, %ymm1, %ymm1
-; ALL-NEXT: vpshufb %ymm1, %ymm6, %ymm1
-; ALL-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv64i8:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512CD-NEXT: vpsubb %ymm0, %ymm2, %ymm3
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512CD-NEXT: vpsubb %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm5
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm5, %ymm6, %ymm5
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsubb %ymm1, %ymm2, %ymm2
+; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubb %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm2
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm6, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: retq
+;
+; AVX512CDBW-LABEL: testv64i8:
+; AVX512CDBW: ## BB#0:
+; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpsubb {{.*}}(%rip), %zmm0, %zmm0
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512CDBW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: retq
+;
+; AVX512BW-LABEL: testv64i8:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubb {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 0)
ret <64 x i8> %out
}
define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
-; ALL-LABEL: testv64i8u:
-; ALL: ## BB#0:
-; ALL-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; ALL-NEXT: vpsubb %ymm0, %ymm2, %ymm3
-; ALL-NEXT: vpand %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; ALL-NEXT: vpsubb %ymm3, %ymm0, %ymm0
-; ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; ALL-NEXT: vpand %ymm4, %ymm0, %ymm5
-; ALL-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; ALL-NEXT: vpshufb %ymm5, %ymm6, %ymm5
-; ALL-NEXT: vpsrlw $4, %ymm0, %ymm0
-; ALL-NEXT: vpand %ymm4, %ymm0, %ymm0
-; ALL-NEXT: vpshufb %ymm0, %ymm6, %ymm0
-; ALL-NEXT: vpaddb %ymm5, %ymm0, %ymm0
-; ALL-NEXT: vpsubb %ymm1, %ymm2, %ymm2
-; ALL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; ALL-NEXT: vpsubb %ymm3, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm4, %ymm1, %ymm2
-; ALL-NEXT: vpshufb %ymm2, %ymm6, %ymm2
-; ALL-NEXT: vpsrlw $4, %ymm1, %ymm1
-; ALL-NEXT: vpand %ymm4, %ymm1, %ymm1
-; ALL-NEXT: vpshufb %ymm1, %ymm6, %ymm1
-; ALL-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; ALL-NEXT: retq
+; AVX512CD-LABEL: testv64i8u:
+; AVX512CD: ## BB#0:
+; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512CD-NEXT: vpsubb %ymm0, %ymm2, %ymm3
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512CD-NEXT: vpsubb %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm5
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT: vpshufb %ymm5, %ymm6, %ymm5
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsubb %ymm1, %ymm2, %ymm2
+; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsubb %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm2
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm6, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: retq
+;
+; AVX512CDBW-LABEL: testv64i8u:
+; AVX512CDBW: ## BB#0:
+; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpsubb {{.*}}(%rip), %zmm0, %zmm0
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512CDBW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: retq
+;
+; AVX512BW-LABEL: testv64i8u:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsubb {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
%out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 -1)
ret <64 x i8> %out
}
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
index b8024203ab2f..a71e3b7b712d 100644
--- a/test/CodeGen/X86/vector-zext.ll
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -4,6 +4,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
define <8 x i16> @zext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_16i8_to_8i16:
@@ -71,6 +72,11 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) {
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: zext_16i8_to_16i16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512-NEXT: retq
entry:
%B = zext <16 x i8> %A to <16 x i16>
ret <16 x i16> %B
@@ -137,20 +143,21 @@ define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
;
; AVX1-LABEL: zext_16i8_to_8i32:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_16i8_to_8i32:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: zext_16i8_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%C = zext <8 x i8> %B to <8 x i32>
@@ -215,20 +222,21 @@ define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp
;
; AVX1-LABEL: zext_16i8_to_4i64:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_16i8_to_4i64:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: zext_16i8_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%C = zext <4 x i8> %B to <4 x i64>
@@ -300,6 +308,11 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: zext_8i16_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: retq
entry:
%B = zext <8 x i16> %A to <8 x i32>
ret <8 x i32>%B
@@ -366,21 +379,21 @@ define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp
;
; AVX1-LABEL: zext_8i16_to_4i64:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_8i16_to_4i64:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: zext_8i16_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%C = zext <4 x i16> %B to <4 x i64>
@@ -452,6 +465,11 @@ define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: zext_4i32_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT: retq
entry:
%B = zext <4 x i32> %A to <4 x i64>
ret <4 x i64>%B
@@ -526,23 +544,20 @@ define <4 x i64> @load_zext_4i8_to_4i64(<4 x i8> *%ptr) {
; SSE2-LABEL: load_zext_4i8_to_4i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_4i8_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8],zero,zero,zero,zero,zero,zero,zero,xmm1[12],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_4i8_to_4i64:
@@ -562,6 +577,11 @@ define <4 x i64> @load_zext_4i8_to_4i64(<4 x i8> *%ptr) {
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_zext_4i8_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: retq
entry:
%X = load <4 x i8>, <4 x i8>* %ptr
%Y = zext <4 x i8> %X to <4 x i64>
@@ -602,22 +622,21 @@ define <8 x i32> @load_zext_8i8_to_8i32(<8 x i8> *%ptr) {
; SSE2-LABEL: load_zext_8i8_to_8i32:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_8i8_to_8i32:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[6],zero,zero,zero
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[14],zero,zero,zero
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_8i8_to_8i32:
@@ -637,12 +656,137 @@ define <8 x i32> @load_zext_8i8_to_8i32(<8 x i8> *%ptr) {
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_zext_8i8_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX512-NEXT: retq
entry:
%X = load <8 x i8>, <8 x i8>* %ptr
%Y = zext <8 x i8> %X to <8 x i32>
ret <8 x i32> %Y
}
+define <8 x i32> @load_zext_16i8_to_8i32(<16 x i8> *%ptr) {
+; SSE2-LABEL: load_zext_16i8_to_8i32:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa (%rdi), %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: load_zext_16i8_to_8i32:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movdqa (%rdi), %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: load_zext_16i8_to_8i32:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movdqa (%rdi), %xmm1
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: load_zext_16i8_to_8i32:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_zext_16i8_to_8i32:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_zext_16i8_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX512-NEXT: retq
+entry:
+ %X = load <16 x i8>, <16 x i8>* %ptr
+ %Y = shufflevector <16 x i8> %X, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %Z = zext <8 x i8> %Y to <8 x i32>
+ ret <8 x i32> %Z
+}
+
+define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) {
+; SSE2-LABEL: load_zext_8i8_to_8i64:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: load_zext_8i8_to_8i64:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pshufb %xmm4, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2,128,128,128,128,128,128,128,3,128,128,128,128,128,128,128]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
+; SSSE3-NEXT: pshufb %xmm5, %xmm1
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSSE3-NEXT: pshufb %xmm4, %xmm2
+; SSSE3-NEXT: pshufb %xmm5, %xmm3
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: load_zext_8i8_to_8i64:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: load_zext_8i8_to_8i64:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: load_zext_8i8_to_8i64:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_zext_8i8_to_8i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: retq
+entry:
+ %X = load <8 x i8>, <8 x i8>* %ptr
+ %Y = zext <8 x i8> %X to <8 x i64>
+ ret <8 x i64> %Y
+}
+
define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) {
; SSE2-LABEL: load_zext_16i8_to_16i16:
; SSE2: # BB#0: # %entry
@@ -679,6 +823,11 @@ define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) {
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_zext_16i8_to_16i16:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVX512-NEXT: retq
entry:
%X = load <16 x i8>, <16 x i8>* %ptr
%Y = zext <16 x i8> %X to <16 x i16>
@@ -751,21 +900,21 @@ define <4 x i64> @load_zext_4i16_to_4i64(<4 x i16> *%ptr) {
; SSE2-LABEL: load_zext_4i16_to_4i64:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_4i16_to_4i64:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9],zero,zero,zero,zero,zero,zero,xmm1[12,13],zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_4i16_to_4i64:
@@ -785,6 +934,11 @@ define <4 x i64> @load_zext_4i16_to_4i64(<4 x i16> *%ptr) {
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_zext_4i16_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX512-NEXT: retq
entry:
%X = load <4 x i16>, <4 x i16>* %ptr
%Y = zext <4 x i16> %X to <4 x i64>
@@ -827,6 +981,11 @@ define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) {
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_zext_8i16_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX512-NEXT: retq
entry:
%X = load <8 x i16>, <8 x i16>* %ptr
%Y = zext <8 x i16> %X to <8 x i32>
@@ -899,6 +1058,11 @@ define <4 x i64> @load_zext_4i32_to_4i64(<4 x i32> *%ptr) {
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_zext_4i32_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX512-NEXT: retq
entry:
%X = load <4 x i32>, <4 x i32>* %ptr
%Y = zext <4 x i32> %X to <4 x i64>
@@ -949,6 +1113,12 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: zext_8i8_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: retq
entry:
%t = zext <8 x i8> %z to <8 x i32>
ret <8 x i32> %t
@@ -991,6 +1161,11 @@ define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: shuf_zext_8i16_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8, i32 5, i32 8, i32 6, i32 8, i32 7, i32 8>
%Z = bitcast <16 x i16> %B to <8 x i32>
@@ -1035,6 +1210,11 @@ define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: shuf_zext_4i32_to_4i64:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 2, i32 4, i32 3, i32 4>
%Z = bitcast <8 x i32> %B to <4 x i64>
@@ -1057,9 +1237,8 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
; SSSE3-LABEL: shuf_zext_8i8_to_8i32:
; SSSE3: # BB#0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2],zero,xmm1[4],zero,xmm1[6],zero,xmm1[8],zero,xmm1[10],zero,xmm1[12],zero,xmm1[14],zero
; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -1088,6 +1267,12 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: shuf_zext_8i8_to_8i32:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <8 x i8> %A, <8 x i8> zeroinitializer, <32 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8, i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8, i32 6, i32 8, i32 8, i32 8, i32 7, i32 8, i32 8, i32 8>
%Z = bitcast <32 x i8> %B to <8 x i32>
@@ -1170,6 +1355,12 @@ define <4 x i64> @shuf_zext_16i8_to_4i64_offset11(<16 x i8> %A) nounwind uwtable
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: shuf_zext_16i8_to_4i64_offset11:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <32 x i32> <i32 11, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 12, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 13, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 14, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%Z = bitcast <32 x i8> %B to <4 x i64>
@@ -1187,10 +1378,7 @@ define <2 x i64> @shuf_zext_8i16_to_2i64_offset6(<8 x i16> %A) nounwind uwtable
;
; SSSE3-LABEL: shuf_zext_8i16_to_2i64_offset6:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7],zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuf_zext_8i16_to_2i64_offset6:
@@ -1254,6 +1442,12 @@ define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: shuf_zext_8i16_to_4i64_offset2:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8>
%Z = bitcast <16 x i16> %B to <4 x i64>
@@ -1322,6 +1516,12 @@ define <8 x i32> @shuf_zext_8i16_to_8i32_offset3(<8 x i16> %A) nounwind uwtable
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: shuf_zext_8i16_to_8i32_offset3:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 3, i32 8, i32 4, i32 8, i32 5, i32 8, i32 6, i32 8, i32 7, i32 8, i32 undef, i32 8, i32 undef, i32 8, i32 undef, i32 8>
%Z = bitcast <16 x i16> %B to <8 x i32>
@@ -1369,6 +1569,12 @@ define <8 x i32> @shuf_zext_16i16_to_8i32_offset8(<16 x i16> %A) nounwind uwtabl
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: shuf_zext_16i16_to_8i32_offset8:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <16 x i16> %A, <16 x i16> zeroinitializer, <16 x i32> <i32 8, i32 16, i32 9, i32 16, i32 10, i32 16, i32 11, i32 16, i32 12, i32 16, i32 undef, i32 16, i32 14, i32 16, i32 undef, i32 16>
%Z = bitcast <16 x i16> %B to <8 x i32>
@@ -1431,6 +1637,12 @@ define <4 x i64> @shuf_zext_4i32_to_4i64_offset1(<4 x i32> %A) nounwind uwtable
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3]
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: shuf_zext_4i32_to_4i64_offset1:
+; AVX512: # BB#0: # %entry
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3]
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT: retq
entry:
%B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> <i32 undef, i32 4, i32 2, i32 4, i32 3, i32 4, i32 undef, i32 4>
%Z = bitcast <8 x i32> %B to <4 x i64>
diff --git a/test/CodeGen/X86/viabs.ll b/test/CodeGen/X86/viabs.ll
index fe528fd4ea24..80930a72aa8a 100644
--- a/test/CodeGen/X86/viabs.ll
+++ b/test/CodeGen/X86/viabs.ll
@@ -1,270 +1,573 @@
-; RUN: llc < %s -march=x86-64 -mattr=sse2 | FileCheck %s -check-prefix=SSE2
-; RUN: llc < %s -march=x86-64 -mattr=ssse3 | FileCheck %s -check-prefix=SSSE3
-; RUN: llc < %s -march=x86-64 -mattr=avx2 | FileCheck %s -check-prefix=AVX2
-; RUN: llc < %s -march=x86-64 -mattr=avx512f | FileCheck %s -check-prefix=AVX512
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
define <4 x i32> @test1(<4 x i32> %a) nounwind {
; SSE2-LABEL: test1:
-; SSE2: movdqa
-; SSE2: psrad $31
-; SSE2-NEXT: padd
-; SSE2-NEXT: pxor
-; SSE2-NEXT: ret
-
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test1:
-; SSSE3: pabsd
-; SSSE3-NEXT: ret
-
-; AVX2-LABEL: test1:
-; AVX2: vpabsd
-; AVX2-NEXT: ret
-
-; AVX512-LABEL: test1:
-; AVX512: vpabsd
-; AVX512-NEXT: ret
- %tmp1neg = sub <4 x i32> zeroinitializer, %a
- %b = icmp sgt <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
- %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
- ret <4 x i32> %abs
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: test1:
+; AVX: # BB#0:
+; AVX-NEXT: vpabsd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %tmp1neg = sub <4 x i32> zeroinitializer, %a
+ %b = icmp sgt <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
+ ret <4 x i32> %abs
}
define <4 x i32> @test2(<4 x i32> %a) nounwind {
; SSE2-LABEL: test2:
-; SSE2: movdqa
-; SSE2: psrad $31
-; SSE2-NEXT: padd
-; SSE2-NEXT: pxor
-; SSE2-NEXT: ret
-
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test2:
-; SSSE3: pabsd
-; SSSE3-NEXT: ret
-
-; AVX2-LABEL: test2:
-; AVX2: vpabsd
-; AVX2-NEXT: ret
-
-; AVX512-LABEL: test2:
-; AVX512: vpabsd
-; AVX512-NEXT: ret
- %tmp1neg = sub <4 x i32> zeroinitializer, %a
- %b = icmp sge <4 x i32> %a, zeroinitializer
- %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
- ret <4 x i32> %abs
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: test2:
+; AVX: # BB#0:
+; AVX-NEXT: vpabsd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %tmp1neg = sub <4 x i32> zeroinitializer, %a
+ %b = icmp sge <4 x i32> %a, zeroinitializer
+ %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
+ ret <4 x i32> %abs
}
define <8 x i16> @test3(<8 x i16> %a) nounwind {
; SSE2-LABEL: test3:
-; SSE2: movdqa
-; SSE2: psraw $15
-; SSE2-NEXT: padd
-; SSE2-NEXT: pxor
-; SSE2-NEXT: ret
-
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: paddw %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test3:
-; SSSE3: pabsw
-; SSSE3-NEXT: ret
-
-; AVX2-LABEL: test3:
-; AVX2: vpabsw
-; AVX2-NEXT: ret
-
-; AVX512-LABEL: test3:
-; AVX512: vpabsw
-; AVX512-NEXT: ret
- %tmp1neg = sub <8 x i16> zeroinitializer, %a
- %b = icmp sgt <8 x i16> %a, zeroinitializer
- %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
- ret <8 x i16> %abs
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsw %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: test3:
+; AVX: # BB#0:
+; AVX-NEXT: vpabsw %xmm0, %xmm0
+; AVX-NEXT: retq
+ %tmp1neg = sub <8 x i16> zeroinitializer, %a
+ %b = icmp sgt <8 x i16> %a, zeroinitializer
+ %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
+ ret <8 x i16> %abs
}
define <16 x i8> @test4(<16 x i8> %a) nounwind {
; SSE2-LABEL: test4:
-; SSE2: pxor
-; SSE2: pcmpgtb
-; SSE2-NEXT: padd
-; SSE2-NEXT: pxor
-; SSE2-NEXT: ret
-
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; SSE2-NEXT: paddb %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test4:
-; SSSE3: pabsb
-; SSSE3-NEXT: ret
-
-; AVX2-LABEL: test4:
-; AVX2: vpabsb
-; AVX2-NEXT: ret
-
-; AVX512-LABEL: test4:
-; AVX512: vpabsb
-; AVX512-NEXT: ret
- %tmp1neg = sub <16 x i8> zeroinitializer, %a
- %b = icmp slt <16 x i8> %a, zeroinitializer
- %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
- ret <16 x i8> %abs
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsb %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: test4:
+; AVX: # BB#0:
+; AVX-NEXT: vpabsb %xmm0, %xmm0
+; AVX-NEXT: retq
+ %tmp1neg = sub <16 x i8> zeroinitializer, %a
+ %b = icmp slt <16 x i8> %a, zeroinitializer
+ %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
+ ret <16 x i8> %abs
}
define <4 x i32> @test5(<4 x i32> %a) nounwind {
; SSE2-LABEL: test5:
-; SSE2: movdqa
-; SSE2: psrad $31
-; SSE2-NEXT: padd
-; SSE2-NEXT: pxor
-; SSE2-NEXT: ret
-
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test5:
-; SSSE3: pabsd
-; SSSE3-NEXT: ret
-
-; AVX2-LABEL: test5:
-; AVX2: vpabsd
-; AVX2-NEXT: ret
-
-; AVX512-LABEL: test5:
-; AVX512: vpabsd
-; AVX512-NEXT: ret
- %tmp1neg = sub <4 x i32> zeroinitializer, %a
- %b = icmp sle <4 x i32> %a, zeroinitializer
- %abs = select <4 x i1> %b, <4 x i32> %tmp1neg, <4 x i32> %a
- ret <4 x i32> %abs
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: test5:
+; AVX: # BB#0:
+; AVX-NEXT: vpabsd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %tmp1neg = sub <4 x i32> zeroinitializer, %a
+ %b = icmp sle <4 x i32> %a, zeroinitializer
+ %abs = select <4 x i1> %b, <4 x i32> %tmp1neg, <4 x i32> %a
+ ret <4 x i32> %abs
}
define <8 x i32> @test6(<8 x i32> %a) nounwind {
+; SSE2-LABEL: test6:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test6:
-; SSSE3: pabsd
-; SSSE3: pabsd
-; SSSE3-NEXT: ret
-
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsd %xmm0, %xmm0
+; SSSE3-NEXT: pabsd %xmm1, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test6:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: test6:
-; AVX2: vpabsd {{.*}}%ymm
-; AVX2-NEXT: ret
-
+; AVX2: # BB#0:
+; AVX2-NEXT: vpabsd %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
; AVX512-LABEL: test6:
-; AVX512: vpabsd {{.*}}%ymm
-; AVX512-NEXT: ret
- %tmp1neg = sub <8 x i32> zeroinitializer, %a
- %b = icmp sgt <8 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
- %abs = select <8 x i1> %b, <8 x i32> %a, <8 x i32> %tmp1neg
- ret <8 x i32> %abs
+; AVX512: # BB#0:
+; AVX512-NEXT: vpabsd %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %tmp1neg = sub <8 x i32> zeroinitializer, %a
+ %b = icmp sgt <8 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %abs = select <8 x i1> %b, <8 x i32> %a, <8 x i32> %tmp1neg
+ ret <8 x i32> %abs
}
define <8 x i32> @test7(<8 x i32> %a) nounwind {
+; SSE2-LABEL: test7:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test7:
-; SSSE3: pabsd
-; SSSE3: pabsd
-; SSSE3-NEXT: ret
-
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsd %xmm0, %xmm0
+; SSSE3-NEXT: pabsd %xmm1, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test7:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: test7:
-; AVX2: vpabsd {{.*}}%ymm
-; AVX2-NEXT: ret
-
+; AVX2: # BB#0:
+; AVX2-NEXT: vpabsd %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
; AVX512-LABEL: test7:
-; AVX512: vpabsd {{.*}}%ymm
-; AVX512-NEXT: ret
- %tmp1neg = sub <8 x i32> zeroinitializer, %a
- %b = icmp sge <8 x i32> %a, zeroinitializer
- %abs = select <8 x i1> %b, <8 x i32> %a, <8 x i32> %tmp1neg
- ret <8 x i32> %abs
+; AVX512: # BB#0:
+; AVX512-NEXT: vpabsd %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %tmp1neg = sub <8 x i32> zeroinitializer, %a
+ %b = icmp sge <8 x i32> %a, zeroinitializer
+ %abs = select <8 x i1> %b, <8 x i32> %a, <8 x i32> %tmp1neg
+ ret <8 x i32> %abs
}
define <16 x i16> @test8(<16 x i16> %a) nounwind {
+; SSE2-LABEL: test8:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: paddw %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test8:
-; SSSE3: pabsw
-; SSSE3: pabsw
-; SSSE3-NEXT: ret
-
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsw %xmm0, %xmm0
+; SSSE3-NEXT: pabsw %xmm1, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test8:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsraw $15, %xmm1, %xmm2
+; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsraw $15, %xmm0, %xmm3
+; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: test8:
-; AVX2: vpabsw {{.*}}%ymm
-; AVX2-NEXT: ret
-
+; AVX2: # BB#0:
+; AVX2-NEXT: vpabsw %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
; AVX512-LABEL: test8:
-; AVX512: vpabsw {{.*}}%ymm
-; AVX512-NEXT: ret
- %tmp1neg = sub <16 x i16> zeroinitializer, %a
- %b = icmp sgt <16 x i16> %a, zeroinitializer
- %abs = select <16 x i1> %b, <16 x i16> %a, <16 x i16> %tmp1neg
- ret <16 x i16> %abs
+; AVX512: # BB#0:
+; AVX512-NEXT: vpabsw %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %tmp1neg = sub <16 x i16> zeroinitializer, %a
+ %b = icmp sgt <16 x i16> %a, zeroinitializer
+ %abs = select <16 x i1> %b, <16 x i16> %a, <16 x i16> %tmp1neg
+ ret <16 x i16> %abs
}
define <32 x i8> @test9(<32 x i8> %a) nounwind {
+; SSE2-LABEL: test9:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm0, %xmm3
+; SSE2-NEXT: paddb %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; SSE2-NEXT: paddb %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test9:
-; SSSE3: pabsb
-; SSSE3: pabsb
-; SSSE3-NEXT: ret
-
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsb %xmm0, %xmm0
+; SSSE3-NEXT: pabsb %xmm1, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test9:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: test9:
-; AVX2: vpabsb {{.*}}%ymm
-; AVX2-NEXT: ret
-
+; AVX2: # BB#0:
+; AVX2-NEXT: vpabsb %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
; AVX512-LABEL: test9:
-; AVX512: vpabsb {{.*}}%ymm
-; AVX512-NEXT: ret
- %tmp1neg = sub <32 x i8> zeroinitializer, %a
- %b = icmp slt <32 x i8> %a, zeroinitializer
- %abs = select <32 x i1> %b, <32 x i8> %tmp1neg, <32 x i8> %a
- ret <32 x i8> %abs
+; AVX512: # BB#0:
+; AVX512-NEXT: vpabsb %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %tmp1neg = sub <32 x i8> zeroinitializer, %a
+ %b = icmp slt <32 x i8> %a, zeroinitializer
+ %abs = select <32 x i1> %b, <32 x i8> %tmp1neg, <32 x i8> %a
+ ret <32 x i8> %abs
}
define <8 x i32> @test10(<8 x i32> %a) nounwind {
+; SSE2-LABEL: test10:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
; SSSE3-LABEL: test10:
-; SSSE3: pabsd
-; SSSE3: pabsd
-; SSSE3-NEXT: ret
-
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsd %xmm0, %xmm0
+; SSSE3-NEXT: pabsd %xmm1, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test10:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1
+; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: test10:
-; AVX2: vpabsd {{.*}}%ymm
-; AVX2-NEXT: ret
-
+; AVX2: # BB#0:
+; AVX2-NEXT: vpabsd %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
; AVX512-LABEL: test10:
-; AVX512: vpabsd {{.*}}%ymm
-; AVX512-NEXT: ret
- %tmp1neg = sub <8 x i32> zeroinitializer, %a
- %b = icmp sle <8 x i32> %a, zeroinitializer
- %abs = select <8 x i1> %b, <8 x i32> %tmp1neg, <8 x i32> %a
- ret <8 x i32> %abs
+; AVX512: # BB#0:
+; AVX512-NEXT: vpabsd %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %tmp1neg = sub <8 x i32> zeroinitializer, %a
+ %b = icmp sle <8 x i32> %a, zeroinitializer
+ %abs = select <8 x i1> %b, <8 x i32> %tmp1neg, <8 x i32> %a
+ ret <8 x i32> %abs
}
define <16 x i32> @test11(<16 x i32> %a) nounwind {
+; SSE2-LABEL: test11:
+; SSE2: # BB#0:
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: paddd %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test11:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pabsd %xmm0, %xmm0
+; SSSE3-NEXT: pabsd %xmm1, %xmm1
+; SSSE3-NEXT: pabsd %xmm2, %xmm2
+; SSSE3-NEXT: pabsd %xmm3, %xmm3
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: test11:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
+; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
+; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
+; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
+; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: test11:
-; AVX2: vpabsd
-; AVX2: vpabsd
-; AVX2-NEXT: ret
-
+; AVX2: # BB#0:
+; AVX2-NEXT: vpabsd %ymm0, %ymm0
+; AVX2-NEXT: vpabsd %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
; AVX512-LABEL: test11:
-; AVX512: vpabsd {{.*}}%zmm
-; AVX512-NEXT: ret
- %tmp1neg = sub <16 x i32> zeroinitializer, %a
- %b = icmp sle <16 x i32> %a, zeroinitializer
- %abs = select <16 x i1> %b, <16 x i32> %tmp1neg, <16 x i32> %a
- ret <16 x i32> %abs
+; AVX512: # BB#0:
+; AVX512-NEXT: vpabsd %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %tmp1neg = sub <16 x i32> zeroinitializer, %a
+ %b = icmp sle <16 x i32> %a, zeroinitializer
+ %abs = select <16 x i1> %b, <16 x i32> %tmp1neg, <16 x i32> %a
+ ret <16 x i32> %abs
}
define <8 x i64> @test12(<8 x i64> %a) nounwind {
+; SSE-LABEL: test12:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: paddq %xmm4, %xmm0
+; SSE-NEXT: pxor %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: paddq %xmm4, %xmm1
+; SSE-NEXT: pxor %xmm4, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: paddq %xmm4, %xmm2
+; SSE-NEXT: pxor %xmm4, %xmm2
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: paddq %xmm4, %xmm3
+; SSE-NEXT: pxor %xmm4, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test12:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
+; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
+; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: test12:
-; AVX2: vpxor
-; AVX2: vpxor
-; AVX2-NEXT: ret
-
+; AVX2: # BB#0:
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
; AVX512-LABEL: test12:
-; AVX512: vpabsq {{.*}}%zmm
-; AVX512-NEXT: ret
- %tmp1neg = sub <8 x i64> zeroinitializer, %a
- %b = icmp sle <8 x i64> %a, zeroinitializer
- %abs = select <8 x i1> %b, <8 x i64> %tmp1neg, <8 x i64> %a
- ret <8 x i64> %abs
+; AVX512: # BB#0:
+; AVX512-NEXT: vpabsq %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %tmp1neg = sub <8 x i64> zeroinitializer, %a
+ %b = icmp sle <8 x i64> %a, zeroinitializer
+ %abs = select <8 x i1> %b, <8 x i64> %tmp1neg, <8 x i64> %a
+ ret <8 x i64> %abs
}
define <8 x i64> @test13(<8 x i64>* %a.ptr) nounwind {
+; SSE-LABEL: test13:
+; SSE: # BB#0:
+; SSE-NEXT: movdqu (%rdi), %xmm0
+; SSE-NEXT: movdqu 16(%rdi), %xmm1
+; SSE-NEXT: movdqu 32(%rdi), %xmm2
+; SSE-NEXT: movdqu 48(%rdi), %xmm3
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: paddq %xmm4, %xmm0
+; SSE-NEXT: pxor %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: paddq %xmm4, %xmm1
+; SSE-NEXT: pxor %xmm4, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: paddq %xmm4, %xmm2
+; SSE-NEXT: pxor %xmm4, %xmm2
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE-NEXT: paddq %xmm4, %xmm3
+; SSE-NEXT: pxor %xmm4, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test13:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovups (%rdi), %ymm0
+; AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
+; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
+; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
; AVX2-LABEL: test13:
-; AVX2: vpxor
-; AVX2: vpxor
-; AVX2-NEXT: ret
-
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
; AVX512-LABEL: test13:
-; AVX512: vpabsq (%
-; AVX512-NEXT: ret
- %a = load <8 x i64>, <8 x i64>* %a.ptr, align 8
- %tmp1neg = sub <8 x i64> zeroinitializer, %a
- %b = icmp sle <8 x i64> %a, zeroinitializer
- %abs = select <8 x i1> %b, <8 x i64> %tmp1neg, <8 x i64> %a
- ret <8 x i64> %abs
+; AVX512: # BB#0:
+; AVX512-NEXT: vpabsq (%rdi), %zmm0
+; AVX512-NEXT: retq
+ %a = load <8 x i64>, <8 x i64>* %a.ptr, align 8
+ %tmp1neg = sub <8 x i64> zeroinitializer, %a
+ %b = icmp sle <8 x i64> %a, zeroinitializer
+ %abs = select <8 x i1> %b, <8 x i64> %tmp1neg, <8 x i64> %a
+ ret <8 x i64> %abs
}
diff --git a/test/CodeGen/X86/virtual-registers-cleared-in-machine-functions-liveins.ll b/test/CodeGen/X86/virtual-registers-cleared-in-machine-functions-liveins.ll
index 2ff8c3a9028f..0eb17fb6c14d 100644
--- a/test/CodeGen/X86/virtual-registers-cleared-in-machine-functions-liveins.ll
+++ b/test/CodeGen/X86/virtual-registers-cleared-in-machine-functions-liveins.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown -o /dev/null -stop-after machine-scheduler %s | FileCheck %s --check-prefix=PRE-RA
-; RUN: llc -mtriple=x86_64-unknown-unknown -o /dev/null -stop-after prologepilog %s | FileCheck %s --check-prefix=POST-RA
+; RUN: llc -mtriple=x86_64-unknown-unknown -o - -stop-after machine-scheduler %s | FileCheck %s --check-prefix=PRE-RA
+; RUN: llc -mtriple=x86_64-unknown-unknown -o - -stop-after prologepilog %s | FileCheck %s --check-prefix=POST-RA
; This test verifies that the virtual register references in machine function's
; liveins are cleared after register allocation.
diff --git a/test/CodeGen/X86/vselect-avx.ll b/test/CodeGen/X86/vselect-avx.ll
index 002561042688..d9f783756d1e 100644
--- a/test/CodeGen/X86/vselect-avx.ll
+++ b/test/CodeGen/X86/vselect-avx.ll
@@ -1,23 +1,29 @@
-; RUN: llc %s -o - -mattr=+avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx"
; For this test we used to optimize the <i1 true, i1 false, i1 false, i1 true>
; mask into <i32 2147483648, i32 0, i32 0, i32 2147483648> because we thought
; we would lower that into a blend where only the high bit is relevant.
; However, since the whole mask is constant, this is simplified incorrectly
; by the generic code, because it was expecting -1 in place of 2147483648.
-;
+;
; The problem does not occur without AVX, because vselect of v4i32 is not legal
; nor custom.
;
; <rdar://problem/18675020>
-; CHECK-LABEL: test:
-; CHECK: vmovdqa {{.*#+}} xmm1 = [65533,124,125,14807]
-; CHECK: vmovdqa {{.*#+}} xmm1 = [65535,0,0,65535]
-; CHECK: ret
define void @test(<4 x i16>* %a, <4 x i16>* %b) {
+; CHECK-LABEL: test:
+; CHECK: ## BB#0: ## %body
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [65533,124,125,14807]
+; CHECK-NEXT: vpshufb %xmm0, %xmm1, %xmm1
+; CHECK-NEXT: vmovq %xmm1, (%rdi)
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,65535]
+; CHECK-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vmovq %xmm0, (%rsi)
+; CHECK-NEXT: retq
body:
%predphi = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -3, i16 545, i16 4385, i16 14807>, <4 x i16> <i16 123, i16 124, i16 125, i16 127>
%predphi42 = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer
@@ -31,17 +37,22 @@ body:
; When shrinking the condition used into the select to match a blend, this
; test case exercises the path where the modified node is not the root
; of the condition.
-;
-; CHECK-LABEL: test2:
-; CHECK: vpslld $31, %xmm0, %xmm0
-; CHECK-NEXT: vpsrad $31, %xmm0, %xmm0
-; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, [[MASK:%ymm[0-9]+]]
-; CHECK: vblendvpd [[MASK]]
-; CHECK: retq
+
define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
+; CHECK-LABEL: test2:
+; CHECK: ## BB#0: ## %bb
+; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
+; CHECK-NEXT: vpsrad $31, %xmm0, %xmm0
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: movq (%rdi,%rsi,8), %rax
+; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [5.000000e-01,5.000000e-01,5.000000e-01,5.000000e-01]
+; CHECK-NEXT: vblendvpd %ymm0, {{.*}}(%rip), %ymm1, %ymm0
+; CHECK-NEXT: vmovupd %ymm0, (%rax)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
bb:
%arrayidx1928 = getelementptr inbounds double*, double** %call1559, i64 %indvars.iv4198
%tmp1888 = load double*, double** %arrayidx1928, align 8
@@ -57,22 +68,32 @@ bb:
; to be optimized into a and. In that case, the conditional mask was wrong.
;
; Make sure that the and is fed by the original mask.
-;
+;
; <rdar://problem/18819506>
-; CHECK-LABEL: test3:
-; Compute the mask.
-; CHECK: vpcmpeqd {{%xmm[0-9]+}}, {{%xmm[0-9]+}}, [[MASK:%xmm[0-9]+]]
-; Do not shrink the bit of the mask.
-; CHECK-NOT: vpslld $31, [[MASK]], {{%xmm[0-9]+}}
-; Use the mask in the blend.
-; CHECK-NEXT: vblendvps [[MASK]], %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
-; Shuffle mask to truncate.
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; CHECK: vpshufb %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
-; CHECK: vpshufb %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
-; CHECK: retq
define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) {
+; CHECK-LABEL: test3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1431655766,1431655766,1431655766,1431655766]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; CHECK-NEXT: vpmuldq %xmm4, %xmm5, %xmm4
+; CHECK-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
+; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; CHECK-NEXT: vpsrld $31, %xmm3, %xmm4
+; CHECK-NEXT: vpaddd %xmm4, %xmm3, %xmm3
+; CHECK-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3
+; CHECK-NEXT: vpsubd %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm1
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vmovq %xmm0, (%rdi)
+; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vmovq %xmm0, (%rsi)
+; CHECK-NEXT: retq
%tmp6 = srem <4 x i32> %induction30, <i32 3, i32 3, i32 3, i32 3>
%tmp7 = icmp eq <4 x i32> %tmp6, zeroinitializer
%predphi = select <4 x i1> %tmp7, <4 x i16> %tmp3, <4 x i16> %tmp12
@@ -85,11 +106,24 @@ define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17,
; We shouldn't try to lower this directly using VSELECT because we don't have
; vpblendvb in AVX1, only in AVX2. Instead, it should be expanded.
-;
-; CHECK-LABEL: PR22706:
-; CHECK: vpcmpgtb
-; CHECK: vpcmpgtb
+
define <32 x i8> @PR22706(<32 x i1> %x) {
+; CHECK-LABEL: PR22706:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpsllw $7, %xmm1, %xmm1
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1
+; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
+; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpgtb %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vandnps {{.*}}(%rip), %ymm0, %ymm1
+; CHECK-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%tmp = select <32 x i1> %x, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
ret <32 x i8> %tmp
}
diff --git a/test/CodeGen/X86/vselect-minmax.ll b/test/CodeGen/X86/vselect-minmax.ll
index edf2a442918a..8e9f1d980913 100644
--- a/test/CodeGen/X86/vselect-minmax.ll
+++ b/test/CodeGen/X86/vselect-minmax.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE4
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
diff --git a/test/CodeGen/X86/vzero-excess.ll b/test/CodeGen/X86/vzero-excess.ll
new file mode 100644
index 000000000000..0ed90741b61e
--- /dev/null
+++ b/test/CodeGen/X86/vzero-excess.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
+
+; In the following 4 tests, the existing call to VZU/VZA ensures clean state before
+; the call to the unknown, so we don't need to insert a second VZU at that point.
+
+define <4 x float> @zeroupper_v4f32(<8 x float> *%x, <8 x float> %y) nounwind {
+; CHECK-LABEL: zeroupper_v4f32:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: subq $48, %rsp
+; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq the_unknown
+; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vaddps (%rbx), %ymm0, %ymm0
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: addq $48, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx.vzeroupper()
+ call void @the_unknown()
+ %loadx = load <8 x float>, <8 x float> *%x, align 32
+ %sum = fadd <8 x float> %loadx, %y
+ %lo = shufflevector <8 x float> %sum, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %hi = shufflevector <8 x float> %sum, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %res = fadd <4 x float> %lo, %hi
+ ret <4 x float> %res
+}
+
+define <8 x float> @zeroupper_v8f32(<8 x float> %x) nounwind {
+; CHECK-LABEL: zeroupper_v8f32:
+; CHECK: # BB#0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq the_unknown
+; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx.vzeroupper()
+ call void @the_unknown()
+ ret <8 x float> %x
+}
+
+define <4 x float> @zeroall_v4f32(<8 x float> *%x, <8 x float> %y) nounwind {
+; CHECK-LABEL: zeroall_v4f32:
+; CHECK: # BB#0:
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: subq $48, %rsp
+; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: vzeroall
+; CHECK-NEXT: callq the_unknown
+; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vaddps (%rbx), %ymm0, %ymm0
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: addq $48, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx.vzeroall()
+ call void @the_unknown()
+ %loadx = load <8 x float>, <8 x float> *%x, align 32
+ %sum = fadd <8 x float> %loadx, %y
+ %lo = shufflevector <8 x float> %sum, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %hi = shufflevector <8 x float> %sum, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %res = fadd <4 x float> %lo, %hi
+ ret <4 x float> %res
+}
+
+define <8 x float> @zeroall_v8f32(<8 x float> %x) nounwind {
+; CHECK-LABEL: zeroall_v8f32:
+; CHECK: # BB#0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
+; CHECK-NEXT: vzeroall
+; CHECK-NEXT: callq the_unknown
+; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx.vzeroall()
+ call void @the_unknown()
+ ret <8 x float> %x
+}
+
+declare void @llvm.x86.avx.vzeroupper() nounwind readnone
+declare void @llvm.x86.avx.vzeroall() nounwind readnone
+declare void @the_unknown() nounwind
+
diff --git a/test/CodeGen/X86/warn-stack.ll b/test/CodeGen/X86/warn-stack.ll
index aa09ad8066fe..7353d073e630 100644
--- a/test/CodeGen/X86/warn-stack.ll
+++ b/test/CodeGen/X86/warn-stack.ll
@@ -12,7 +12,7 @@ entry:
ret void
}
-; CHECK: warning: stack size limit exceeded (104) in warn
+; CHECK: warning: stack size limit exceeded (88) in warn
define void @warn() nounwind ssp {
entry:
%buffer = alloca [80 x i8], align 1
diff --git a/test/CodeGen/X86/weak_def_can_be_hidden.ll b/test/CodeGen/X86/weak_def_can_be_hidden.ll
index 8e6d34c89d88..516bc02cc2f8 100644
--- a/test/CodeGen/X86/weak_def_can_be_hidden.ll
+++ b/test/CodeGen/X86/weak_def_can_be_hidden.ll
@@ -4,7 +4,7 @@
; RUN: llc -mtriple=i686-apple-darwin9 -O0 < %s | FileCheck --check-prefix=CHECK-D89 %s
; RUN: llc -mtriple=i686-apple-darwin8 -O0 < %s | FileCheck --check-prefix=CHECK-D89 %s
-@v1 = linkonce_odr constant i32 32
+@v1 = linkonce_odr local_unnamed_addr constant i32 32
; CHECK: .globl _v1
; CHECK: .weak_def_can_be_hidden _v1
@@ -27,7 +27,7 @@ define i32* @f2() {
ret i32* @v2
}
-@v3 = linkonce_odr unnamed_addr global i32 32
+@v3 = linkonce_odr unnamed_addr constant i32 32
; CHECK: .globl _v3
; CHECK: .weak_def_can_be_hidden _v3
@@ -38,9 +38,9 @@ define i32* @f3() {
ret i32* @v3
}
-@v4 = linkonce_odr global i32 32
+@v4 = linkonce_odr unnamed_addr global i32 32
; CHECK: .globl _v4
-; CHECK: .weak_definition _v4
+; CHECK: .weak_def_can_be_hidden _v4
; CHECK-D89: .globl _v4
; CHECK-D89: .weak_definition _v4
diff --git a/test/CodeGen/X86/widen_bitops-0.ll b/test/CodeGen/X86/widen_bitops-0.ll
new file mode 100644
index 000000000000..f8316d0e1ea2
--- /dev/null
+++ b/test/CodeGen/X86/widen_bitops-0.ll
@@ -0,0 +1,307 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X32-SSE --check-prefix=X32-SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64-SSE --check-prefix=X64-SSE42
+
+;
+; AND/XOR/OR i24 as v3i8
+;
+
+define i24 @and_i24_as_v3i8(i24 %a, i24 %b) nounwind {
+; X32-SSE-LABEL: and_i24_as_v3i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: and_i24_as_v3i8:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: andl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i24 %a to <3 x i8>
+ %2 = bitcast i24 %b to <3 x i8>
+ %3 = and <3 x i8> %1, %2
+ %4 = bitcast <3 x i8> %3 to i24
+ ret i24 %4
+}
+
+define i24 @xor_i24_as_v3i8(i24 %a, i24 %b) nounwind {
+; X32-SSE-LABEL: xor_i24_as_v3i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: xor_i24_as_v3i8:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: xorl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i24 %a to <3 x i8>
+ %2 = bitcast i24 %b to <3 x i8>
+ %3 = xor <3 x i8> %1, %2
+ %4 = bitcast <3 x i8> %3 to i24
+ ret i24 %4
+}
+
+define i24 @or_i24_as_v3i8(i24 %a, i24 %b) nounwind {
+; X32-SSE-LABEL: or_i24_as_v3i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: or_i24_as_v3i8:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: orl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i24 %a to <3 x i8>
+ %2 = bitcast i24 %b to <3 x i8>
+ %3 = or <3 x i8> %1, %2
+ %4 = bitcast <3 x i8> %3 to i24
+ ret i24 %4
+}
+
+;
+; AND/XOR/OR i24 as v8i3
+;
+
+define i24 @and_i24_as_v8i3(i24 %a, i24 %b) nounwind {
+; X32-SSE-LABEL: and_i24_as_v8i3:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: and_i24_as_v8i3:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: andl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i24 %a to <8 x i3>
+ %2 = bitcast i24 %b to <8 x i3>
+ %3 = and <8 x i3> %1, %2
+ %4 = bitcast <8 x i3> %3 to i24
+ ret i24 %4
+}
+
+define i24 @xor_i24_as_v8i3(i24 %a, i24 %b) nounwind {
+; X32-SSE-LABEL: xor_i24_as_v8i3:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: xor_i24_as_v8i3:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: xorl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i24 %a to <8 x i3>
+ %2 = bitcast i24 %b to <8 x i3>
+ %3 = xor <8 x i3> %1, %2
+ %4 = bitcast <8 x i3> %3 to i24
+ ret i24 %4
+}
+
+define i24 @or_i24_as_v8i3(i24 %a, i24 %b) nounwind {
+; X32-SSE-LABEL: or_i24_as_v8i3:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: or_i24_as_v8i3:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: orl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i24 %a to <8 x i3>
+ %2 = bitcast i24 %b to <8 x i3>
+ %3 = or <8 x i3> %1, %2
+ %4 = bitcast <8 x i3> %3 to i24
+ ret i24 %4
+}
+
+;
+; AND/XOR/OR v3i8 as i24
+;
+
+define <3 x i8> @and_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
+; X32-SSE-LABEL: and_v3i8_as_i24:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pand %xmm0, %xmm1
+; X32-SSE-NEXT: pextrb $0, %xmm1, %eax
+; X32-SSE-NEXT: pextrb $4, %xmm1, %edx
+; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X32-SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-SSE-NEXT: # kill: %DL<def> %DL<kill> %EDX<kill>
+; X32-SSE-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: and_v3i8_as_i24:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: pinsrd $1, %r8d, %xmm0
+; X64-SSE-NEXT: pinsrd $2, %r9d, %xmm0
+; X64-SSE-NEXT: movd %edi, %xmm1
+; X64-SSE-NEXT: pinsrd $1, %esi, %xmm1
+; X64-SSE-NEXT: pinsrd $2, %edx, %xmm1
+; X64-SSE-NEXT: pand %xmm0, %xmm1
+; X64-SSE-NEXT: pextrb $0, %xmm1, %eax
+; X64-SSE-NEXT: pextrb $4, %xmm1, %edx
+; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X64-SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE-NEXT: # kill: %DL<def> %DL<kill> %EDX<kill>
+; X64-SSE-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; X64-SSE-NEXT: retq
+ %1 = bitcast <3 x i8> %a to i24
+ %2 = bitcast <3 x i8> %b to i24
+ %3 = and i24 %1, %2
+ %4 = bitcast i24 %3 to <3 x i8>
+ ret <3 x i8> %4
+}
+
+define <3 x i8> @xor_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
+; X32-SSE-LABEL: xor_v3i8_as_i24:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pxor %xmm0, %xmm1
+; X32-SSE-NEXT: pextrb $0, %xmm1, %eax
+; X32-SSE-NEXT: pextrb $4, %xmm1, %edx
+; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X32-SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-SSE-NEXT: # kill: %DL<def> %DL<kill> %EDX<kill>
+; X32-SSE-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: xor_v3i8_as_i24:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: pinsrd $1, %r8d, %xmm0
+; X64-SSE-NEXT: pinsrd $2, %r9d, %xmm0
+; X64-SSE-NEXT: movd %edi, %xmm1
+; X64-SSE-NEXT: pinsrd $1, %esi, %xmm1
+; X64-SSE-NEXT: pinsrd $2, %edx, %xmm1
+; X64-SSE-NEXT: pxor %xmm0, %xmm1
+; X64-SSE-NEXT: pextrb $0, %xmm1, %eax
+; X64-SSE-NEXT: pextrb $4, %xmm1, %edx
+; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X64-SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE-NEXT: # kill: %DL<def> %DL<kill> %EDX<kill>
+; X64-SSE-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; X64-SSE-NEXT: retq
+ %1 = bitcast <3 x i8> %a to i24
+ %2 = bitcast <3 x i8> %b to i24
+ %3 = xor i24 %1, %2
+ %4 = bitcast i24 %3 to <3 x i8>
+ ret <3 x i8> %4
+}
+
+define <3 x i8> @or_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
+; X32-SSE-LABEL: or_v3i8_as_i24:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT: por %xmm0, %xmm1
+; X32-SSE-NEXT: pextrb $0, %xmm1, %eax
+; X32-SSE-NEXT: pextrb $4, %xmm1, %edx
+; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X32-SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-SSE-NEXT: # kill: %DL<def> %DL<kill> %EDX<kill>
+; X32-SSE-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: or_v3i8_as_i24:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: pinsrd $1, %r8d, %xmm0
+; X64-SSE-NEXT: pinsrd $2, %r9d, %xmm0
+; X64-SSE-NEXT: movd %edi, %xmm1
+; X64-SSE-NEXT: pinsrd $1, %esi, %xmm1
+; X64-SSE-NEXT: pinsrd $2, %edx, %xmm1
+; X64-SSE-NEXT: por %xmm0, %xmm1
+; X64-SSE-NEXT: pextrb $0, %xmm1, %eax
+; X64-SSE-NEXT: pextrb $4, %xmm1, %edx
+; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx
+; X64-SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE-NEXT: # kill: %DL<def> %DL<kill> %EDX<kill>
+; X64-SSE-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; X64-SSE-NEXT: retq
+ %1 = bitcast <3 x i8> %a to i24
+ %2 = bitcast <3 x i8> %b to i24
+ %3 = or i24 %1, %2
+ %4 = bitcast i24 %3 to <3 x i8>
+ ret <3 x i8> %4
+}
+
+;
+; AND/XOR/OR v8i3 as i24
+;
+
+define <8 x i3> @and_v8i3_as_i24(<8 x i3> %a, <8 x i3> %b) nounwind {
+; X32-SSE-LABEL: and_v8i3_as_i24:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: andps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: and_v8i3_as_i24:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: andps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %1 = bitcast <8 x i3> %a to i24
+ %2 = bitcast <8 x i3> %b to i24
+ %3 = and i24 %1, %2
+ %4 = bitcast i24 %3 to <8 x i3>
+ ret <8 x i3> %4
+}
+
+define <8 x i3> @xor_v8i3_as_i24(<8 x i3> %a, <8 x i3> %b) nounwind {
+; X32-SSE-LABEL: xor_v8i3_as_i24:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: xorps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: xor_v8i3_as_i24:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: xorps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %1 = bitcast <8 x i3> %a to i24
+ %2 = bitcast <8 x i3> %b to i24
+ %3 = xor i24 %1, %2
+ %4 = bitcast i24 %3 to <8 x i3>
+ ret <8 x i3> %4
+}
+
+define <8 x i3> @or_v8i3_as_i24(<8 x i3> %a, <8 x i3> %b) nounwind {
+; X32-SSE-LABEL: or_v8i3_as_i24:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: orps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: or_v8i3_as_i24:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: orps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %1 = bitcast <8 x i3> %a to i24
+ %2 = bitcast <8 x i3> %b to i24
+ %3 = or i24 %1, %2
+ %4 = bitcast i24 %3 to <8 x i3>
+ ret <8 x i3> %4
+}
diff --git a/test/CodeGen/X86/widen_bitops-1.ll b/test/CodeGen/X86/widen_bitops-1.ll
new file mode 100644
index 000000000000..f2a6b22c2af4
--- /dev/null
+++ b/test/CodeGen/X86/widen_bitops-1.ll
@@ -0,0 +1,235 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X32-SSE --check-prefix=X32-SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64-SSE --check-prefix=X64-SSE42
+
+;
+; AND/XOR/OR i32 as v4i8
+;
+
+define i32 @and_i32_as_v4i8(i32 %a, i32 %b) nounwind {
+; X32-SSE-LABEL: and_i32_as_v4i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: and_i32_as_v4i8:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: andl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i32 %a to <4 x i8>
+ %2 = bitcast i32 %b to <4 x i8>
+ %3 = and <4 x i8> %1, %2
+ %4 = bitcast <4 x i8> %3 to i32
+ ret i32 %4
+}
+
+define i32 @xor_i32_as_v4i8(i32 %a, i32 %b) nounwind {
+; X32-SSE-LABEL: xor_i32_as_v4i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: xor_i32_as_v4i8:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: xorl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i32 %a to <4 x i8>
+ %2 = bitcast i32 %b to <4 x i8>
+ %3 = xor <4 x i8> %1, %2
+ %4 = bitcast <4 x i8> %3 to i32
+ ret i32 %4
+}
+
+define i32 @or_i32_as_v4i8(i32 %a, i32 %b) nounwind {
+; X32-SSE-LABEL: or_i32_as_v4i8:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: or_i32_as_v4i8:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: orl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i32 %a to <4 x i8>
+ %2 = bitcast i32 %b to <4 x i8>
+ %3 = or <4 x i8> %1, %2
+ %4 = bitcast <4 x i8> %3 to i32
+ ret i32 %4
+}
+
+;
+; AND/XOR/OR i32 as v8i4
+;
+
+define i32 @and_i32_as_v8i4(i32 %a, i32 %b) nounwind {
+; X32-SSE-LABEL: and_i32_as_v8i4:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: and_i32_as_v8i4:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: andl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i32 %a to <8 x i4>
+ %2 = bitcast i32 %b to <8 x i4>
+ %3 = and <8 x i4> %1, %2
+ %4 = bitcast <8 x i4> %3 to i32
+ ret i32 %4
+}
+
+define i32 @xor_i32_as_v8i4(i32 %a, i32 %b) nounwind {
+; X32-SSE-LABEL: xor_i32_as_v8i4:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: xor_i32_as_v8i4:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: xorl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i32 %a to <8 x i4>
+ %2 = bitcast i32 %b to <8 x i4>
+ %3 = xor <8 x i4> %1, %2
+ %4 = bitcast <8 x i4> %3 to i32
+ ret i32 %4
+}
+
+define i32 @or_i32_as_v8i4(i32 %a, i32 %b) nounwind {
+; X32-SSE-LABEL: or_i32_as_v8i4:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: or_i32_as_v8i4:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: orl %esi, %edi
+; X64-SSE-NEXT: movl %edi, %eax
+; X64-SSE-NEXT: retq
+ %1 = bitcast i32 %a to <8 x i4>
+ %2 = bitcast i32 %b to <8 x i4>
+ %3 = or <8 x i4> %1, %2
+ %4 = bitcast <8 x i4> %3 to i32
+ ret i32 %4
+}
+
+;
+; AND/XOR/OR v4i8 as i32
+;
+
+define <4 x i8> @and_v4i8_as_i32(<4 x i8> %a, <4 x i8> %b) nounwind {
+; X32-SSE-LABEL: and_v4i8_as_i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: andps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: and_v4i8_as_i32:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: andps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %1 = bitcast <4 x i8> %a to i32
+ %2 = bitcast <4 x i8> %b to i32
+ %3 = and i32 %1, %2
+ %4 = bitcast i32 %3 to <4 x i8>
+ ret <4 x i8> %4
+}
+
+define <4 x i8> @xor_v4i8_as_i32(<4 x i8> %a, <4 x i8> %b) nounwind {
+; X32-SSE-LABEL: xor_v4i8_as_i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: xorps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: xor_v4i8_as_i32:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: xorps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %1 = bitcast <4 x i8> %a to i32
+ %2 = bitcast <4 x i8> %b to i32
+ %3 = xor i32 %1, %2
+ %4 = bitcast i32 %3 to <4 x i8>
+ ret <4 x i8> %4
+}
+
+define <4 x i8> @or_v4i8_as_i32(<4 x i8> %a, <4 x i8> %b) nounwind {
+; X32-SSE-LABEL: or_v4i8_as_i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: orps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: or_v4i8_as_i32:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: orps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %1 = bitcast <4 x i8> %a to i32
+ %2 = bitcast <4 x i8> %b to i32
+ %3 = or i32 %1, %2
+ %4 = bitcast i32 %3 to <4 x i8>
+ ret <4 x i8> %4
+}
+
+;
+; AND/XOR/OR v8i4 as i32
+;
+
+define <8 x i4> @and_v8i4_as_i32(<8 x i4> %a, <8 x i4> %b) nounwind {
+; X32-SSE-LABEL: and_v8i4_as_i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: andps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: and_v8i4_as_i32:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: andps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %1 = bitcast <8 x i4> %a to i32
+ %2 = bitcast <8 x i4> %b to i32
+ %3 = and i32 %1, %2
+ %4 = bitcast i32 %3 to <8 x i4>
+ ret <8 x i4> %4
+}
+
+define <8 x i4> @xor_v8i4_as_i32(<8 x i4> %a, <8 x i4> %b) nounwind {
+; X32-SSE-LABEL: xor_v8i4_as_i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: xorps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: xor_v8i4_as_i32:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: xorps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %1 = bitcast <8 x i4> %a to i32
+ %2 = bitcast <8 x i4> %b to i32
+ %3 = xor i32 %1, %2
+ %4 = bitcast i32 %3 to <8 x i4>
+ ret <8 x i4> %4
+}
+
+define <8 x i4> @or_v8i4_as_i32(<8 x i4> %a, <8 x i4> %b) nounwind {
+; X32-SSE-LABEL: or_v8i4_as_i32:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: orps %xmm1, %xmm0
+; X32-SSE-NEXT: retl
+;
+; X64-SSE-LABEL: or_v8i4_as_i32:
+; X64-SSE: # BB#0:
+; X64-SSE-NEXT: orps %xmm1, %xmm0
+; X64-SSE-NEXT: retq
+ %1 = bitcast <8 x i4> %a to i32
+ %2 = bitcast <8 x i4> %b to i32
+ %3 = or i32 %1, %2
+ %4 = bitcast i32 %3 to <8 x i4>
+ ret <8 x i4> %4
+}
diff --git a/test/CodeGen/X86/widen_compare-1.ll b/test/CodeGen/X86/widen_compare-1.ll
new file mode 100644
index 000000000000..8ea0db53a391
--- /dev/null
+++ b/test/CodeGen/X86/widen_compare-1.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64
+
+; compare v2i16
+
+define <2 x i16> @compare_v2i64_to_v2i16(<2 x i16>* %src) nounwind {
+; X86-LABEL: compare_v2i64_to_v2i16:
+; X86: # BB#0:
+; X86-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0]
+; X86-NEXT: retl
+;
+; X64-LABEL: compare_v2i64_to_v2i16:
+; X64: # BB#0:
+; X64-NEXT: movaps {{.*#+}} xmm0 = [65535,65535]
+; X64-NEXT: retq
+ %val = load <2 x i16>, <2 x i16>* %src, align 4
+ %cmp = icmp uge <2 x i16> %val, %val
+ %sel = select <2 x i1> %cmp, <2 x i16> <i16 -1, i16 -1>, <2 x i16> zeroinitializer
+ ret <2 x i16> %sel
+}
diff --git a/test/CodeGen/X86/widen_conv-1.ll b/test/CodeGen/X86/widen_conv-1.ll
index 3f54ab694c07..cf5a8abda18c 100644
--- a/test/CodeGen/X86/widen_conv-1.ll
+++ b/test/CodeGen/X86/widen_conv-1.ll
@@ -1,12 +1,101 @@
-; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
-; CHECK: paddd
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64
; truncate v2i64 to v2i32
-define void @convert(<2 x i32>* %dst.addr, <2 x i64> %src) nounwind {
+define void @convert_v2i64_to_v2i32(<2 x i32>* %dst.addr, <2 x i64> %src) nounwind {
+; X86-LABEL: convert_v2i64_to_v2i32:
+; X86: # BB#0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: paddd .LCPI0_0, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: movq %xmm0, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: convert_v2i64_to_v2i32:
+; X64: # BB#0: # %entry
+; X64-NEXT: paddd {{.*}}(%rip), %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: movq %xmm0, (%rdi)
+; X64-NEXT: retq
entry:
%val = trunc <2 x i64> %src to <2 x i32>
%add = add <2 x i32> %val, < i32 1, i32 1 >
store <2 x i32> %add, <2 x i32>* %dst.addr
ret void
}
+
+; truncate v3i32 to v3i8
+
+define void @convert_v3i32_to_v3i8(<3 x i8>* %dst.addr, <3 x i32>* %src.addr) nounwind {
+; X86-LABEL: convert_v3i32_to_v3i8:
+; X86: # BB#0: # %entry
+; X86-NEXT: pushl %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movdqa (%ecx), %xmm0
+; X86-NEXT: paddd .LCPI1_0, %xmm0
+; X86-NEXT: pextrb $8, %xmm0, 2(%eax)
+; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X86-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; X86-NEXT: movd %xmm0, %ecx
+; X86-NEXT: movw %cx, (%eax)
+; X86-NEXT: popl %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: convert_v3i32_to_v3i8:
+; X64: # BB#0: # %entry
+; X64-NEXT: movdqa (%rsi), %xmm0
+; X64-NEXT: paddd {{.*}}(%rip), %xmm0
+; X64-NEXT: pextrb $8, %xmm0, 2(%rdi)
+; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; X64-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: movw %ax, (%rdi)
+; X64-NEXT: retq
+entry:
+ %load = load <3 x i32>, <3 x i32>* %src.addr
+ %val = trunc <3 x i32> %load to <3 x i8>
+ %add = add <3 x i8> %val, < i8 1, i8 1, i8 1 >
+ store <3 x i8> %add, <3 x i8>* %dst.addr
+ ret void
+}
+
+; truncate v5i16 to v5i8
+
+define void @convert_v5i16_to_v5i8(<5 x i8>* %dst.addr, <5 x i16>* %src.addr) nounwind {
+; X86-LABEL: convert_v5i16_to_v5i8:
+; X86: # BB#0: # %entry
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movdqa (%ecx), %xmm0
+; X86-NEXT: paddw .LCPI2_0, %xmm0
+; X86-NEXT: pextrb $8, %xmm0, 4(%eax)
+; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; X86-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X86-NEXT: movd %xmm0, (%eax)
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
+; X64-LABEL: convert_v5i16_to_v5i8:
+; X64: # BB#0: # %entry
+; X64-NEXT: movdqa (%rsi), %xmm0
+; X64-NEXT: paddw {{.*}}(%rip), %xmm0
+; X64-NEXT: pextrb $8, %xmm0, 4(%rdi)
+; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X64-NEXT: movd %xmm0, (%rdi)
+; X64-NEXT: retq
+entry:
+ %load = load <5 x i16>, <5 x i16>* %src.addr
+ %val = trunc <5 x i16> %load to <5 x i8>
+ %add = add <5 x i8> %val, < i8 1, i8 1, i8 1, i8 1, i8 1 >
+ store <5 x i8> %add, <5 x i8>* %dst.addr
+ ret void
+}
diff --git a/test/CodeGen/X86/widen_conv-2.ll b/test/CodeGen/X86/widen_conv-2.ll
index c8646c6489a1..015b0faa9827 100644
--- a/test/CodeGen/X86/widen_conv-2.ll
+++ b/test/CodeGen/X86/widen_conv-2.ll
@@ -1,11 +1,26 @@
-; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
-; CHECK: psllq $48, %xmm0
-; CHECK: psrad $16, %xmm0
-; CHECK: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64
; sign extension v2i16 to v2i32
-define void @convert(<2 x i32>* %dst.addr, <2 x i16> %src) nounwind {
+define void @convert_v2i16_v2i32(<2 x i32>* %dst.addr, <2 x i16> %src) nounwind {
+; X86-LABEL: convert_v2i16_v2i32:
+; X86: # BB#0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: psllq $48, %xmm0
+; X86-NEXT: psrad $16, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT: movq %xmm0, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: convert_v2i16_v2i32:
+; X64: # BB#0: # %entry
+; X64-NEXT: psllq $48, %xmm0
+; X64-NEXT: psrad $16, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X64-NEXT: movq %xmm0, (%rdi)
+; X64-NEXT: retq
entry:
%signext = sext <2 x i16> %src to <2 x i32> ; <<12 x i8>> [#uses=1]
store <2 x i32> %signext, <2 x i32>* %dst.addr
diff --git a/test/CodeGen/X86/widen_conv-3.ll b/test/CodeGen/X86/widen_conv-3.ll
index 0a6eea049d37..e8fa1043e9f0 100644
--- a/test/CodeGen/X86/widen_conv-3.ll
+++ b/test/CodeGen/X86/widen_conv-3.ll
@@ -1,11 +1,150 @@
-; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
-; CHECK: cvtdq2ps
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE42
; sign to float v2i16 to v2f32
-define void @convert(<2 x float>* %dst.addr, <2 x i16> %src) nounwind {
+define void @convert_v2i16_to_v2f32(<2 x float>* %dst.addr, <2 x i16> %src) nounwind {
+; X86-SSE2-LABEL: convert_v2i16_to_v2f32:
+; X86-SSE2: # BB#0: # %entry
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: psllq $48, %xmm0
+; X86-SSE2-NEXT: psrad $16, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE2-NEXT: movss %xmm0, (%eax)
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE2-NEXT: movss %xmm0, 4(%eax)
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: convert_v2i16_to_v2f32:
+; X86-SSE42: # BB#0: # %entry
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT: psllq $48, %xmm0
+; X86-SSE42-NEXT: psrad $16, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE42-NEXT: extractps $1, %xmm0, 4(%eax)
+; X86-SSE42-NEXT: movss %xmm0, (%eax)
+; X86-SSE42-NEXT: retl
+;
+; X64-LABEL: convert_v2i16_to_v2f32:
+; X64: # BB#0: # %entry
+; X64-NEXT: psllq $48, %xmm0
+; X64-NEXT: psrad $16, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X64-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-NEXT: movlps %xmm0, (%rdi)
+; X64-NEXT: retq
entry:
%val = sitofp <2 x i16> %src to <2 x float>
- store <2 x float> %val, <2 x float>* %dst.addr
+ store <2 x float> %val, <2 x float>* %dst.addr, align 4
+ ret void
+}
+
+; sign to float v3i8 to v3f32
+
+define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) nounwind {
+; X86-SSE2-LABEL: convert_v3i8_to_v3f32:
+; X86-SSE2: # BB#0: # %entry
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: pushl %esi
+; X86-SSE2-NEXT: andl $-16, %esp
+; X86-SSE2-NEXT: subl $32, %esp
+; X86-SSE2-NEXT: movl 8(%ebp), %eax
+; X86-SSE2-NEXT: movl 12(%ebp), %ecx
+; X86-SSE2-NEXT: movzwl (%ecx), %edx
+; X86-SSE2-NEXT: movd %edx, %xmm0
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-SSE2-NEXT: movdqa %xmm0, (%esp)
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT: shll $8, %edx
+; X86-SSE2-NEXT: movzbl (%esp), %esi
+; X86-SSE2-NEXT: orl %edx, %esi
+; X86-SSE2-NEXT: pinsrw $0, %esi, %xmm0
+; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx
+; X86-SSE2-NEXT: pinsrw $1, %ecx, %xmm0
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-SSE2-NEXT: psrad $24, %xmm0
+; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE2-NEXT: movss %xmm0, (%eax)
+; X86-SSE2-NEXT: movaps %xmm0, %xmm1
+; X86-SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; X86-SSE2-NEXT: movss %xmm1, 8(%eax)
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE2-NEXT: movss %xmm0, 4(%eax)
+; X86-SSE2-NEXT: leal -4(%ebp), %esp
+; X86-SSE2-NEXT: popl %esi
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: convert_v3i8_to_v3f32:
+; X86-SSE42: # BB#0: # %entry
+; X86-SSE42-NEXT: pushl %eax
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE42-NEXT: movzbl 2(%ecx), %edx
+; X86-SSE42-NEXT: movzwl (%ecx), %ecx
+; X86-SSE42-NEXT: movd %ecx, %xmm0
+; X86-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X86-SSE42-NEXT: pinsrd $2, %edx, %xmm0
+; X86-SSE42-NEXT: pslld $24, %xmm0
+; X86-SSE42-NEXT: psrad $24, %xmm0
+; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE42-NEXT: extractps $2, %xmm0, 8(%eax)
+; X86-SSE42-NEXT: extractps $1, %xmm0, 4(%eax)
+; X86-SSE42-NEXT: movss %xmm0, (%eax)
+; X86-SSE42-NEXT: popl %eax
+; X86-SSE42-NEXT: retl
+;
+; X64-SSE2-LABEL: convert_v3i8_to_v3f32:
+; X64-SSE2: # BB#0: # %entry
+; X64-SSE2-NEXT: movzwl (%rsi), %eax
+; X64-SSE2-NEXT: movd %rax, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE2-NEXT: shll $8, %eax
+; X64-SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; X64-SSE2-NEXT: orl %eax, %ecx
+; X64-SSE2-NEXT: pinsrw $0, %ecx, %xmm0
+; X64-SSE2-NEXT: movzbl 2(%rsi), %eax
+; X64-SSE2-NEXT: pinsrw $1, %eax, %xmm0
+; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-SSE2-NEXT: psrad $24, %xmm0
+; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-SSE2-NEXT: movlps %xmm0, (%rdi)
+; X64-SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-SSE2-NEXT: movss %xmm0, 8(%rdi)
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: convert_v3i8_to_v3f32:
+; X64-SSE42: # BB#0: # %entry
+; X64-SSE42-NEXT: movzbl 2(%rsi), %eax
+; X64-SSE42-NEXT: movzwl (%rsi), %ecx
+; X64-SSE42-NEXT: movd %rcx, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; X64-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-SSE42-NEXT: pinsrd $2, %eax, %xmm0
+; X64-SSE42-NEXT: pslld $24, %xmm0
+; X64-SSE42-NEXT: psrad $24, %xmm0
+; X64-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-SSE42-NEXT: extractps $2, %xmm0, 8(%rdi)
+; X64-SSE42-NEXT: movlps %xmm0, (%rdi)
+; X64-SSE42-NEXT: retq
+entry:
+ %load = load <3 x i8>, <3 x i8>* %src.addr, align 1
+ %cvt = sitofp <3 x i8> %load to <3 x float>
+ store <3 x float> %cvt, <3 x float>* %dst.addr, align 4
ret void
}
diff --git a/test/CodeGen/X86/widen_conv-4.ll b/test/CodeGen/X86/widen_conv-4.ll
index f633592f2ef8..71b7976ab8bd 100644
--- a/test/CodeGen/X86/widen_conv-4.ll
+++ b/test/CodeGen/X86/widen_conv-4.ll
@@ -1,11 +1,174 @@
-; RUN: llc < %s -march=x86 -mcpu=nehalem -mattr=+sse4.2 | FileCheck %s
-; CHECK-NOT: cvtsi2ss
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE42
; unsigned to float v7i16 to v7f32
-define void @convert(<7 x float>* %dst.addr, <7 x i16> %src) nounwind {
+define void @convert_v7i16_v7f32(<7 x float>* %dst.addr, <7 x i16> %src) nounwind {
+; X86-SSE2-LABEL: convert_v7i16_v7f32:
+; X86-SSE2: # BB#0: # %entry
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: pxor %xmm1, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X86-SSE2-NEXT: cvtdq2ps %xmm2, %xmm2
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE2-NEXT: movups %xmm0, (%eax)
+; X86-SSE2-NEXT: movss %xmm2, 16(%eax)
+; X86-SSE2-NEXT: movaps %xmm2, %xmm0
+; X86-SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X86-SSE2-NEXT: movss %xmm0, 24(%eax)
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movss %xmm2, 20(%eax)
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: convert_v7i16_v7f32:
+; X86-SSE42: # BB#0: # %entry
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT: pxor %xmm1, %xmm1
+; X86-SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X86-SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE42-NEXT: cvtdq2ps %xmm2, %xmm1
+; X86-SSE42-NEXT: extractps $2, %xmm0, 24(%eax)
+; X86-SSE42-NEXT: extractps $1, %xmm0, 20(%eax)
+; X86-SSE42-NEXT: movups %xmm1, (%eax)
+; X86-SSE42-NEXT: movss %xmm0, 16(%eax)
+; X86-SSE42-NEXT: retl
+;
+; X64-SSE2-LABEL: convert_v7i16_v7f32:
+; X64-SSE2: # BB#0: # %entry
+; X64-SSE2-NEXT: pxor %xmm1, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X64-SSE2-NEXT: cvtdq2ps %xmm2, %xmm2
+; X64-SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-SSE2-NEXT: movlps %xmm0, 16(%rdi)
+; X64-SSE2-NEXT: movups %xmm2, (%rdi)
+; X64-SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-SSE2-NEXT: movss %xmm0, 24(%rdi)
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: convert_v7i16_v7f32:
+; X64-SSE42: # BB#0: # %entry
+; X64-SSE42-NEXT: pxor %xmm1, %xmm1
+; X64-SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-SSE42-NEXT: cvtdq2ps %xmm2, %xmm1
+; X64-SSE42-NEXT: extractps $2, %xmm0, 24(%rdi)
+; X64-SSE42-NEXT: movlps %xmm0, 16(%rdi)
+; X64-SSE42-NEXT: movups %xmm1, (%rdi)
+; X64-SSE42-NEXT: retq
entry:
- %val = sitofp <7 x i16> %src to <7 x float>
- store <7 x float> %val, <7 x float>* %dst.addr
+ %val = uitofp <7 x i16> %src to <7 x float>
+ store <7 x float> %val, <7 x float>* %dst.addr, align 4
+ ret void
+}
+
+; unsigned to float v3i8 to v3f32
+
+define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) nounwind {
+; X86-SSE2-LABEL: convert_v3i8_to_v3f32:
+; X86-SSE2: # BB#0: # %entry
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: pushl %esi
+; X86-SSE2-NEXT: andl $-16, %esp
+; X86-SSE2-NEXT: subl $32, %esp
+; X86-SSE2-NEXT: movl 8(%ebp), %eax
+; X86-SSE2-NEXT: movl 12(%ebp), %ecx
+; X86-SSE2-NEXT: movzwl (%ecx), %edx
+; X86-SSE2-NEXT: movd %edx, %xmm0
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-SSE2-NEXT: movdqa %xmm0, (%esp)
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE2-NEXT: shll $8, %edx
+; X86-SSE2-NEXT: movzbl (%esp), %esi
+; X86-SSE2-NEXT: orl %edx, %esi
+; X86-SSE2-NEXT: pinsrw $0, %esi, %xmm0
+; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx
+; X86-SSE2-NEXT: pinsrw $1, %ecx, %xmm0
+; X86-SSE2-NEXT: pxor %xmm1, %xmm1
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE2-NEXT: movss %xmm0, (%eax)
+; X86-SSE2-NEXT: movaps %xmm0, %xmm1
+; X86-SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; X86-SSE2-NEXT: movss %xmm1, 8(%eax)
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE2-NEXT: movss %xmm0, 4(%eax)
+; X86-SSE2-NEXT: leal -4(%ebp), %esp
+; X86-SSE2-NEXT: popl %esi
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: convert_v3i8_to_v3f32:
+; X86-SSE42: # BB#0: # %entry
+; X86-SSE42-NEXT: pushl %eax
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE42-NEXT: movzbl 2(%ecx), %edx
+; X86-SSE42-NEXT: movzwl (%ecx), %ecx
+; X86-SSE42-NEXT: movd %ecx, %xmm0
+; X86-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X86-SSE42-NEXT: pinsrd $2, %edx, %xmm0
+; X86-SSE42-NEXT: pand .LCPI1_0, %xmm0
+; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
+; X86-SSE42-NEXT: extractps $2, %xmm0, 8(%eax)
+; X86-SSE42-NEXT: extractps $1, %xmm0, 4(%eax)
+; X86-SSE42-NEXT: movss %xmm0, (%eax)
+; X86-SSE42-NEXT: popl %eax
+; X86-SSE42-NEXT: retl
+;
+; X64-SSE2-LABEL: convert_v3i8_to_v3f32:
+; X64-SSE2: # BB#0: # %entry
+; X64-SSE2-NEXT: movzwl (%rsi), %eax
+; X64-SSE2-NEXT: movd %rax, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-SSE2-NEXT: shll $8, %eax
+; X64-SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; X64-SSE2-NEXT: orl %eax, %ecx
+; X64-SSE2-NEXT: pinsrw $0, %ecx, %xmm0
+; X64-SSE2-NEXT: movzbl 2(%rsi), %eax
+; X64-SSE2-NEXT: pinsrw $1, %eax, %xmm0
+; X64-SSE2-NEXT: pxor %xmm1, %xmm1
+; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-SSE2-NEXT: movlps %xmm0, (%rdi)
+; X64-SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-SSE2-NEXT: movss %xmm0, 8(%rdi)
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: convert_v3i8_to_v3f32:
+; X64-SSE42: # BB#0: # %entry
+; X64-SSE42-NEXT: movzbl 2(%rsi), %eax
+; X64-SSE42-NEXT: movzwl (%rsi), %ecx
+; X64-SSE42-NEXT: movd %rcx, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; X64-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-SSE42-NEXT: pinsrd $2, %eax, %xmm0
+; X64-SSE42-NEXT: pand {{.*}}(%rip), %xmm0
+; X64-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0
+; X64-SSE42-NEXT: extractps $2, %xmm0, 8(%rdi)
+; X64-SSE42-NEXT: movlps %xmm0, (%rdi)
+; X64-SSE42-NEXT: retq
+entry:
+ %load = load <3 x i8>, <3 x i8>* %src.addr, align 1
+ %cvt = uitofp <3 x i8> %load to <3 x float>
+ store <3 x float> %cvt, <3 x float>* %dst.addr, align 4
ret void
}
diff --git a/test/CodeGen/X86/widen_load-1.ll b/test/CodeGen/X86/widen_load-1.ll
index c670b45df747..810e409c175c 100644
--- a/test/CodeGen/X86/widen_load-1.ll
+++ b/test/CodeGen/X86/widen_load-1.ll
@@ -1,5 +1,5 @@
-; RUN: llc %s -o - -march=x86-64 -mattr=-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=SSE
-; RUN: llc %s -o - -march=x86-64 -mattr=+avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=AVX
+; RUN: llc -stack-symbol-ordering=0 %s -o - -march=x86-64 -mattr=-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=SSE
+; RUN: llc -stack-symbol-ordering=0 %s -o - -march=x86-64 -mattr=+avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=AVX
; PR4891
; PR5626
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index fad1fa32559a..00aeb009b638 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+sse4.2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s
; Test based on pr5626 to load/store
;
@@ -6,10 +7,13 @@
%i32vec3 = type <3 x i32>
define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
; CHECK-LABEL: add3i32:
-; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: paddd (%{{.*}}), %[[R0]]
-; CHECK-NEXT: pextrd $2, %[[R0]], 8(%{{.*}})
-; CHECK-NEXT: movq %[[R0]], (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: movdqa (%rsi), %xmm0
+; CHECK-NEXT: paddd (%rdx), %xmm0
+; CHECK-NEXT: pextrd $2, %xmm0, 8(%rdi)
+; CHECK-NEXT: movq %xmm0, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i32vec3, %i32vec3* %ap, align 16
%b = load %i32vec3, %i32vec3* %bp, align 16
%x = add %i32vec3 %a, %b
@@ -19,13 +23,16 @@ define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
; CHECK-LABEL: add3i32_2:
-; CHECK: movq (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: pinsrd $2, 8(%{{.*}}), %[[R0]]
-; CHECK-NEXT: movq (%{{.*}}), %[[R1:xmm[0-9]+]]
-; CHECK-NEXT: pinsrd $2, 8(%{{.*}}), %[[R1]]
-; CHECK-NEXT: paddd %[[R0]], %[[R1]]
-; CHECK-NEXT: pextrd $2, %[[R1]], 8(%{{.*}})
-; CHECK-NEXT: movq %[[R1]], (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: pinsrd $2, 8(%rsi), %xmm0
+; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: pinsrd $2, 8(%rdx), %xmm1
+; CHECK-NEXT: paddd %xmm0, %xmm1
+; CHECK-NEXT: pextrd $2, %xmm1, 8(%rdi)
+; CHECK-NEXT: movq %xmm1, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i32vec3, %i32vec3* %ap, align 8
%b = load %i32vec3, %i32vec3* %bp, align 8
%x = add %i32vec3 %a, %b
@@ -36,13 +43,16 @@ define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
%i32vec7 = type <7 x i32>
define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) {
; CHECK-LABEL: add7i32:
-; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
-; CHECK-NEXT: paddd (%{{.*}}), %[[R0]]
-; CHECK-NEXT: paddd 16(%{{.*}}), %[[R1]]
-; CHECK-NEXT: pextrd $2, %[[R1]], 24(%{{.*}})
-; CHECK-NEXT: movq %[[R1]], 16(%{{.*}})
-; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: movdqa (%rsi), %xmm0
+; CHECK-NEXT: movdqa 16(%rsi), %xmm1
+; CHECK-NEXT: paddd (%rdx), %xmm0
+; CHECK-NEXT: paddd 16(%rdx), %xmm1
+; CHECK-NEXT: pextrd $2, %xmm1, 24(%rdi)
+; CHECK-NEXT: movq %xmm1, 16(%rdi)
+; CHECK-NEXT: movdqa %xmm0, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i32vec7, %i32vec7* %ap, align 16
%b = load %i32vec7, %i32vec7* %bp, align 16
%x = add %i32vec7 %a, %b
@@ -53,15 +63,18 @@ define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) {
%i32vec12 = type <12 x i32>
define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) {
; CHECK-LABEL: add12i32:
-; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
-; CHECK-NEXT: movdqa 32(%{{.*}}), %[[R2:xmm[0-9]+]]
-; CHECK-NEXT: paddd (%{{.*}}), %[[R0]]
-; CHECK-NEXT: paddd 16(%{{.*}}), %[[R1]]
-; CHECK-NEXT: paddd 32(%{{.*}}), %[[R2]]
-; CHECK-NEXT: movdqa %[[R2]], 32(%{{.*}})
-; CHECK-NEXT: movdqa %[[R1]], 16(%{{.*}})
-; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: movdqa (%rsi), %xmm0
+; CHECK-NEXT: movdqa 16(%rsi), %xmm1
+; CHECK-NEXT: movdqa 32(%rsi), %xmm2
+; CHECK-NEXT: paddd (%rdx), %xmm0
+; CHECK-NEXT: paddd 16(%rdx), %xmm1
+; CHECK-NEXT: paddd 32(%rdx), %xmm2
+; CHECK-NEXT: movdqa %xmm2, 32(%rdi)
+; CHECK-NEXT: movdqa %xmm1, 16(%rdi)
+; CHECK-NEXT: movdqa %xmm0, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i32vec12, %i32vec12* %ap, align 16
%b = load %i32vec12, %i32vec12* %bp, align 16
%x = add %i32vec12 %a, %b
@@ -73,13 +86,16 @@ define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) {
%i16vec3 = type <3 x i16>
define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
; CHECK-LABEL: add3i16:
-; CHECK: pmovzxwd (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: pmovzxwd (%{{.*}}), %[[R1:xmm[0-9]+]]
-; CHECK-NEXT: paddd %[[R0]], %[[R1]]
-; CHECK-NEXT: pextrw $4, %[[R1]], 4(%{{.*}})
-; CHECK-NEXT: pshufb {{.*}}, %[[R1]]
-; CHECK-NEXT: pmovzxdq %[[R1]], %[[R0]]
-; CHECK-NEXT: movd %[[R0]], (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; CHECK-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; CHECK-NEXT: paddd %xmm0, %xmm1
+; CHECK-NEXT: pextrw $4, %xmm1, 4(%rdi)
+; CHECK-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; CHECK-NEXT: movd %xmm0, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i16vec3, %i16vec3* %ap, align 16
%b = load %i16vec3, %i16vec3* %bp, align 16
%x = add %i16vec3 %a, %b
@@ -90,10 +106,13 @@ define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp
%i16vec4 = type <4 x i16>
define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind {
; CHECK-LABEL: add4i16:
-; CHECK: movq (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: movq (%{{.*}}), %[[R1:xmm[0-9]+]]
-; CHECK-NEXT: paddw %[[R0]], %[[R1]]
-; CHECK-NEXT: movq %[[R1]], (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: paddw %xmm0, %xmm1
+; CHECK-NEXT: movq %xmm1, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i16vec4, %i16vec4* %ap, align 16
%b = load %i16vec4, %i16vec4* %bp, align 16
%x = add %i16vec4 %a, %b
@@ -104,12 +123,15 @@ define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp
%i16vec12 = type <12 x i16>
define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* %bp) nounwind {
; CHECK-LABEL: add12i16:
-; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
-; CHECK-NEXT: paddw (%{{.*}}), %[[R0]]
-; CHECK-NEXT: paddw 16(%{{.*}}), %[[R1]]
-; CHECK-NEXT: movq %[[R1]], 16(%{{.*}})
-; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: movdqa (%rsi), %xmm0
+; CHECK-NEXT: movdqa 16(%rsi), %xmm1
+; CHECK-NEXT: paddw (%rdx), %xmm0
+; CHECK-NEXT: paddw 16(%rdx), %xmm1
+; CHECK-NEXT: movq %xmm1, 16(%rdi)
+; CHECK-NEXT: movdqa %xmm0, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i16vec12, %i16vec12* %ap, align 16
%b = load %i16vec12, %i16vec12* %bp, align 16
%x = add %i16vec12 %a, %b
@@ -120,15 +142,18 @@ define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12*
%i16vec18 = type <18 x i16>
define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* %bp) nounwind {
; CHECK-LABEL: add18i16:
-; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
-; CHECK-NEXT: movdqa 32(%{{.*}}), %[[R2:xmm[0-9]+]]
-; CHECK-NEXT: paddw (%{{.*}}), %[[R0]]
-; CHECK-NEXT: paddw 16(%{{.*}}), %[[R1]]
-; CHECK-NEXT: paddw 32(%{{.*}}), %[[R2]]
-; CHECK-NEXT: movd %[[R2]], 32(%{{.*}})
-; CHECK-NEXT: movdqa %[[R1]], 16(%{{.*}})
-; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: movdqa (%rsi), %xmm0
+; CHECK-NEXT: movdqa 16(%rsi), %xmm1
+; CHECK-NEXT: movdqa 32(%rsi), %xmm2
+; CHECK-NEXT: paddw (%rdx), %xmm0
+; CHECK-NEXT: paddw 16(%rdx), %xmm1
+; CHECK-NEXT: paddw 32(%rdx), %xmm2
+; CHECK-NEXT: movd %xmm2, 32(%rdi)
+; CHECK-NEXT: movdqa %xmm1, 16(%rdi)
+; CHECK-NEXT: movdqa %xmm0, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i16vec18, %i16vec18* %ap, align 16
%b = load %i16vec18, %i16vec18* %bp, align 16
%x = add %i16vec18 %a, %b
@@ -140,14 +165,17 @@ define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18*
%i8vec3 = type <3 x i8>
define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
; CHECK-LABEL: add3i8:
-; CHECK: pmovzxbd (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: pmovzxbd (%{{.*}}), %[[R1:xmm[0-9]+]]
-; CHECK-NEXT: paddd %[[R0]], %[[R1]]
-; CHECK-NEXT: pextrb $8, %[[R1]], 2(%{{.*}})
-; CHECK-NEXT: pshufb {{.*}}, %[[R1]]
-; CHECK-NEXT: pmovzxwq %[[R1]], %[[R0]]
-; CHECK-NEXT: movd %[[R0]], %e[[R2:[abcd]]]x
-; CHECK-NEXT: movw %[[R2]]x, (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; CHECK-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; CHECK-NEXT: paddd %xmm0, %xmm1
+; CHECK-NEXT: pextrb $8, %xmm1, 2(%rdi)
+; CHECK-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movw %ax, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i8vec3, %i8vec3* %ap, align 16
%b = load %i8vec3, %i8vec3* %bp, align 16
%x = add %i8vec3 %a, %b
@@ -158,15 +186,18 @@ define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) no
%i8vec31 = type <31 x i8>
define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp) nounwind {
; CHECK-LABEL: add31i8:
-; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
-; CHECK-NEXT: paddb (%{{.*}}), %[[R0]]
-; CHECK-NEXT: paddb 16(%{{.*}}), %[[R1]]
-; CHECK-NEXT: pextrb $14, %[[R1]], 30(%{{.*}})
-; CHECK-NEXT: pextrw $6, %[[R1]], 28(%{{.*}})
-; CHECK-NEXT: pextrd $2, %[[R1]], 24(%{{.*}})
-; CHECK-NEXT: movq %[[R1]], 16(%{{.*}})
-; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
+; CHECK: # BB#0:
+; CHECK-NEXT: movdqa (%rsi), %xmm0
+; CHECK-NEXT: movdqa 16(%rsi), %xmm1
+; CHECK-NEXT: paddb (%rdx), %xmm0
+; CHECK-NEXT: paddb 16(%rdx), %xmm1
+; CHECK-NEXT: pextrb $14, %xmm1, 30(%rdi)
+; CHECK-NEXT: pextrw $6, %xmm1, 28(%rdi)
+; CHECK-NEXT: pextrd $2, %xmm1, 24(%rdi)
+; CHECK-NEXT: movq %xmm1, 16(%rdi)
+; CHECK-NEXT: movdqa %xmm0, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%a = load %i8vec31, %i8vec31* %ap, align 16
%b = load %i8vec31, %i8vec31* %bp, align 16
%x = add %i8vec31 %a, %b
@@ -178,29 +209,31 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp
%i8vec3pack = type { <3 x i8>, i8 }
define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind {
; CHECK-LABEL: rot:
-; CHECK: movdqa {{.*}}, %[[CONSTANT0:xmm[0-9]+]]
-; CHECK-NEXT: movdqa {{.*}}, %[[SHUFFLE_MASK:xmm[0-9]+]]
-; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[CONSTANT0]]
-; CHECK-NEXT: pmovzxwq %[[CONSTANT0]], %[[CONSTANT0]]
-; CHECK-NEXT: movd %[[CONSTANT0]], %e[[R0:[abcd]]]x
-; CHECK-NEXT: movw %[[R0]]x, (%[[PTR0:.*]])
-; CHECK-NEXT: movb $-98, 2(%[[PTR0]])
-; CHECK-NEXT: movdqa {{.*}}, %[[CONSTANT1:xmm[0-9]+]]
-; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[CONSTANT1]]
-; CHECK-NEXT: pmovzxwq %[[CONSTANT1]], %[[CONSTANT1]]
-; CHECK-NEXT: movd %[[CONSTANT1]], %e[[R1:[abcd]]]x
-; CHECK-NEXT: movw %[[R1]]x, (%[[PTR1:.*]])
-; CHECK-NEXT: movb $1, 2(%[[PTR1]])
-; CHECK-NEXT: pmovzxbd (%[[PTR0]]), %[[X0:xmm[0-9]+]]
-; CHECK-NEXT: movdqa %[[X0]], %[[X1:xmm[0-9]+]]
-; CHECK-NEXT: psrld $1, %[[X1]]
-; CHECK-NEXT: pblendw $192, %[[X0]], %[[X1]]
-; CHECK-NEXT: pextrb $8, %[[X1]], 2(%{{.*}})
-; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[X1]]
-; CHECK-NEXT: pmovzxwq %[[X1]], %[[X3:xmm[0-9]+]]
-; CHECK-NEXT: movd %[[X3]], %e[[R0:[abcd]]]x
-; CHECK-NEXT: movw %[[R0]]x, (%{{.*}})
-
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <0,4,8,128,u,u,u,u,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <158,158,158,u>
+; CHECK-NEXT: pshufb %xmm0, %xmm1
+; CHECK-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; CHECK-NEXT: movd %xmm1, %eax
+; CHECK-NEXT: movw %ax, (%rsi)
+; CHECK-NEXT: movb $-98, 2(%rsi)
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <1,1,1,u>
+; CHECK-NEXT: pshufb %xmm0, %xmm1
+; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movw %ax, (%rdx)
+; CHECK-NEXT: movb $1, 2(%rdx)
+; CHECK-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrld $1, %xmm1
+; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; CHECK-NEXT: pextrb $8, %xmm1, 2(%rdi)
+; CHECK-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movw %ax, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
entry:
%storetmp = bitcast %i8vec3pack* %X to <3 x i8>*
store <3 x i8> <i8 -98, i8 -98, i8 -98>, <3 x i8>* %storetmp
diff --git a/test/CodeGen/X86/win-alloca-expander.ll b/test/CodeGen/X86/win-alloca-expander.ll
new file mode 100644
index 000000000000..45ca3b214ab8
--- /dev/null
+++ b/test/CodeGen/X86/win-alloca-expander.ll
@@ -0,0 +1,154 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-pc-win32 -O0
+
+%struct.S = type { [1024 x i8] }
+%struct.T = type { [3000 x i8] }
+%struct.U = type { [10000 x i8] }
+
+define void @basics() {
+; CHECK-LABEL: basics:
+entry:
+ br label %bb1
+
+; Allocation move sizes should have been removed.
+; CHECK-NOT: movl $1024
+; CHECK-NOT: movl $3000
+
+bb1:
+ %p0 = alloca %struct.S
+; The allocation is small enough not to require stack probing, but the %esp
+; offset after the prologue is not known, so the stack must be touched before
+; the pointer is adjusted.
+; CHECK: pushl %eax
+; CHECK: subl $1020, %esp
+
+ %saved_stack = tail call i8* @llvm.stacksave()
+
+ %p1 = alloca %struct.S
+; We know the %esp offset from above, so there is no need to touch the stack
+; before adjusting it.
+; CHECK: subl $1024, %esp
+
+ %p2 = alloca %struct.T
+; The offset is now 2048 bytes, so allocating a T must touch the stack again.
+; CHECK: pushl %eax
+; CHECK: subl $2996, %esp
+
+ call void @f(%struct.S* %p0)
+; CHECK: calll
+
+ %p3 = alloca %struct.T
+; The call above touched the stack, so there is room for a T object.
+; CHECK: subl $3000, %esp
+
+ %p4 = alloca %struct.U
+; The U object is large enough to require stack probing.
+; CHECK: movl $10000, %eax
+; CHECK: calll __chkstk
+
+ %p5 = alloca %struct.T
+; The stack probing above touched the tip of the stack, so there's room for a T.
+; CHECK: subl $3000, %esp
+
+ call void @llvm.stackrestore(i8* %saved_stack)
+ %p6 = alloca %struct.S
+; The stack restore means we lose track of the stack pointer and must probe.
+; CHECK: pushl %eax
+; CHECK: subl $1020, %esp
+
+; Use the pointers so they're not optimized away.
+ call void @f(%struct.S* %p1)
+ call void @g(%struct.T* %p2)
+ call void @g(%struct.T* %p3)
+ call void @h(%struct.U* %p4)
+ call void @g(%struct.T* %p5)
+ ret void
+}
+
+define void @loop() {
+; CHECK-LABEL: loop:
+entry:
+ br label %bb1
+
+bb1:
+ %p1 = alloca %struct.S
+; The entry offset is unknown; touch-and-sub.
+; CHECK: pushl %eax
+; CHECK: subl $1020, %esp
+ br label %loop1
+
+loop1:
+ %i1 = phi i32 [ 10, %bb1 ], [ %dec1, %loop1 ]
+ %p2 = alloca %struct.S
+; We know the incoming offset from bb1, but from the back-edge, we assume the
+; worst, and therefore touch-and-sub to allocate.
+; CHECK: pushl %eax
+; CHECK: subl $1020, %esp
+ %dec1 = sub i32 %i1, 1
+ %cmp1 = icmp sgt i32 %i1, 0
+ br i1 %cmp1, label %loop1, label %end
+; CHECK: decl
+; CHECK: jg
+
+end:
+ call void @f(%struct.S* %p1)
+ call void @f(%struct.S* %p2)
+ ret void
+}
+
+define void @probe_size_attribute() "stack-probe-size"="512" {
+; CHECK-LABEL: probe_size_attribute:
+entry:
+ br label %bb1
+
+bb1:
+ %p0 = alloca %struct.S
+; The allocation would be small enough not to require probing, were it not
+; for the stack-probe-size attribute.
+; CHECK: movl $1024, %eax
+; CHECK: calll __chkstk
+ call void @f(%struct.S* %p0)
+ ret void
+}
+
+define void @cfg(i1 %x, i1 %y) {
+; Test that the blocks are analyzed in the correct order.
+; CHECK-LABEL: cfg:
+entry:
+ br i1 %x, label %bb1, label %bb2
+
+bb1:
+ %p1 = alloca %struct.S
+; CHECK: pushl %eax
+; CHECK: subl $1020, %esp
+ br label %bb3
+bb2:
+ %p2 = alloca %struct.T
+; CHECK: pushl %eax
+; CHECK: subl $2996, %esp
+ br label %bb3
+
+bb3:
+ br i1 %y, label %bb4, label %bb5
+
+bb4:
+ %p4 = alloca %struct.S
+; CHECK: subl $1024, %esp
+ call void @f(%struct.S* %p4)
+ ret void
+
+bb5:
+ %p5 = alloca %struct.T
+; CHECK: pushl %eax
+; CHECK: subl $2996, %esp
+ call void @g(%struct.T* %p5)
+ ret void
+}
+
+
+declare void @f(%struct.S*)
+declare void @g(%struct.T*)
+declare void @h(%struct.U*)
+
+declare i8* @llvm.stacksave()
+declare void @llvm.stackrestore(i8*)
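The probe_size_attribute test above exercises the per-function "stack-probe-size" attribute, which lowers the default one-page (4096-byte) probe threshold. As a minimal standalone sketch of the same behaviour on the same triple (the struct, function, and callee names here are illustrative, not part of the imported test):

; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
%struct.Buf = type { [1024 x i8] }

define void @probe_512() "stack-probe-size"="512" {
; A 1024-byte local exceeds the lowered 512-byte threshold, so the
; allocation is expected to go through __chkstk rather than a plain subl.
; CHECK-LABEL: probe_512:
; CHECK: movl $1024, %eax
; CHECK: calll __chkstk
entry:
  %b = alloca %struct.Buf
  call void @use(%struct.Buf* %b)
  ret void
}

declare void @use(%struct.Buf*)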
diff --git a/test/CodeGen/X86/win-catchpad-csrs.ll b/test/CodeGen/X86/win-catchpad-csrs.ll
index 327ee45b4326..64c7a9747df9 100644
--- a/test/CodeGen/X86/win-catchpad-csrs.ll
+++ b/test/CodeGen/X86/win-catchpad-csrs.ll
@@ -51,7 +51,7 @@ handler1:
; X86: calll _getint
; X86: calll _useints
; X86: movl $0, -{{[0-9]+}}(%ebp)
-; X86: movl $1, (%esp)
+; X86: pushl $1
; X86: calll _f
; X86: [[contbb:LBB0_[0-9]+]]: # %try.cont
; X86: popl %esi
@@ -71,7 +71,7 @@ handler1:
; X86: subl $16, %esp
; X86: addl $12, %ebp
; X86: movl $1, -{{[0-9]+}}(%ebp)
-; X86: movl $2, (%esp)
+; X86: pushl $2
; X86: calll _f
; X86: movl $[[restorebb]], %eax
; X86-NEXT: addl $16, %esp
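Many of the 32-bit hunks in this import, both here and in the win32-eh and SEH tests further down, track a single backend change: call arguments that used to be stored through %esp are now pushed, with the stack re-adjusted after the call where the tests check for it. The pattern the updated CHECK lines expect, side by side (the callee _f is just the one these tests already use):

; Before the change:              After the change:
;   movl $1, (%esp)                 pushl $1
;   calll _f                        calll _f
;                                   addl $4, %esp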
diff --git a/test/CodeGen/X86/win-catchpad-varargs.ll b/test/CodeGen/X86/win-catchpad-varargs.ll
index 6508f3bd7d64..a31b3d72c56c 100644
--- a/test/CodeGen/X86/win-catchpad-varargs.ll
+++ b/test/CodeGen/X86/win-catchpad-varargs.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=X64
-; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s --check-prefix=X86
+; RUN: llc -stack-symbol-ordering=0 -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=X64
+; RUN: llc -stack-symbol-ordering=0 -mtriple=i686-windows-msvc < %s | FileCheck %s --check-prefix=X86
declare void @llvm.va_start(i8*)
declare void @llvm.va_end(i8*)
diff --git a/test/CodeGen/X86/win-catchpad.ll b/test/CodeGen/X86/win-catchpad.ll
index 836c53bda8e6..48866490c16c 100644
--- a/test/CodeGen/X86/win-catchpad.ll
+++ b/test/CodeGen/X86/win-catchpad.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -mtriple=i686-pc-windows-msvc < %s | FileCheck --check-prefix=X86 %s
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-pc-windows-msvc < %s | FileCheck --check-prefix=X64 %s
+; RUN: llc -stack-symbol-ordering=0 -verify-machineinstrs -mtriple=i686-pc-windows-msvc < %s | FileCheck --check-prefix=X86 %s
+; RUN: llc -stack-symbol-ordering=0 -verify-machineinstrs -mtriple=x86_64-pc-windows-msvc < %s | FileCheck --check-prefix=X64 %s
; Loosely based on IR for this C++ source code:
; void f(int p);
@@ -57,23 +57,23 @@ try.cont:
; X86: movl %esp, -[[sp_offset:[0-9]+]](%ebp)
; X86: movl $0, -{{[0-9]+}}(%ebp)
; X86: leal -[[local_offs:[0-9]+]](%ebp), %[[addr_reg:[a-z]+]]
-; X86-DAG: movl %[[addr_reg]], 4(%esp)
-; X86-DAG: movl $1, (%esp)
+; X86-DAG: pushl %[[addr_reg]]
+; X86-DAG: pushl $1
; X86: calll _f
; X86: [[contbb:LBB0_[0-9]+]]: # %try.cont
; X86: retl
-; X86: [[restorebb1:LBB0_[0-9]+]]: # Block address taken
-; X86-NEXT: # %handler1
-; X86-NEXT: addl $12, %ebp
-; X86: jmp [[contbb]]
-
; FIXME: These should be de-duplicated.
; X86: [[restorebb2:LBB0_[0-9]+]]: # Block address taken
; X86-NEXT: # %handler2
; X86-NEXT: addl $12, %ebp
; X86: jmp [[contbb]]
+; X86: [[restorebb1:LBB0_[0-9]+]]: # Block address taken
+; X86-NEXT: # %handler1
+; X86-NEXT: addl $12, %ebp
+; X86: jmp [[contbb]]
+
; X86: "?catch$[[catch1bb:[0-9]+]]@?0?try_catch_catch@4HA":
; X86: LBB0_[[catch1bb]]: # %handler1{{$}}
; X86: pushl %ebp
@@ -83,13 +83,14 @@ try.cont:
; X86-DAG: movl -32(%ebp), %[[e_reg:[a-z]+]]
; X86-DAG: leal -[[local_offs]](%ebp), %[[addr_reg:[a-z]+]]
; X86-DAG: movl $1, -{{[0-9]+}}(%ebp)
-; X86-DAG: movl %[[addr_reg]], 4(%esp)
-; X86-DAG: movl %[[e_reg]], (%esp)
+; X86: pushl %[[addr_reg]]
+; X86: pushl %[[e_reg]]
; X86: calll _f
-; X86-NEXT: movl $[[restorebb1]], %eax
-; X86-NEXT: addl $8, %esp
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl
+; X86: addl $8, %esp
+; X86: movl $[[restorebb1]], %eax
+; X86: addl $8, %esp
+; X86: popl %ebp
+; X86: retl
; X86: "?catch$[[catch2bb:[0-9]+]]@?0?try_catch_catch@4HA":
; X86: LBB0_[[catch2bb]]: # %handler2{{$}}
@@ -99,13 +100,14 @@ try.cont:
; X86: movl %esp, -[[sp_offset]](%ebp)
; X86-DAG: leal -[[local_offs]](%ebp), %[[addr_reg:[a-z]+]]
; X86-DAG: movl $1, -{{[0-9]+}}(%ebp)
-; X86-DAG: movl %[[addr_reg]], 4(%esp)
-; X86-DAG: movl $3, (%esp)
+; X86: pushl %[[addr_reg]]
+; X86: pushl $3
; X86: calll _f
-; X86-NEXT: movl $[[restorebb2]], %eax
-; X86-NEXT: addl $8, %esp
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl
+; X86: addl $8, %esp
+; X86: movl $[[restorebb2]], %eax
+; X86: addl $8, %esp
+; X86: popl %ebp
+; X86: retl
; X86: L__ehtable$try_catch_catch:
; X86: $handlerMap$0$try_catch_catch:
@@ -122,19 +124,19 @@ try.cont:
; X64: Lfunc_begin0:
; X64: pushq %rbp
; X64: .seh_pushreg 5
-; X64: subq $48, %rsp
-; X64: .seh_stackalloc 48
-; X64: leaq 48(%rsp), %rbp
-; X64: .seh_setframe 5, 48
+; X64: subq $[[STCK_ALLOC:.*]], %rsp
+; X64: .seh_stackalloc [[STCK_ALLOC]]
+; X64: leaq [[STCK_ALLOC]](%rsp), %rbp
+; X64: .seh_setframe 5, [[STCK_ALLOC]]
; X64: .seh_endprologue
-; X64: movq $-2, -8(%rbp)
+; X64: movq $-2, -16(%rbp)
; X64: .Ltmp0
; X64-DAG: leaq -[[local_offs:[0-9]+]](%rbp), %rdx
; X64-DAG: movl $1, %ecx
; X64: callq f
; X64: [[contbb:\.LBB0_[0-9]+]]: # Block address taken
; X64-NEXT: # %try.cont
-; X64: addq $48, %rsp
+; X64: addq $[[STCK_ALLOC]], %rsp
; X64: popq %rbp
; X64: retq
@@ -145,10 +147,10 @@ try.cont:
; X64: .seh_pushreg 5
; X64: subq $32, %rsp
; X64: .seh_stackalloc 32
-; X64: leaq 48(%rdx), %rbp
+; X64: leaq [[STCK_ALLOC]](%rdx), %rbp
; X64: .seh_endprologue
; X64-DAG: leaq -[[local_offs]](%rbp), %rdx
-; X64-DAG: movl -12(%rbp), %ecx
+; X64-DAG: movl -4(%rbp), %ecx
; X64: callq f
; X64: leaq [[contbb]](%rip), %rax
; X64-NEXT: addq $32, %rsp
@@ -162,7 +164,7 @@ try.cont:
; X64: .seh_pushreg 5
; X64: subq $32, %rsp
; X64: .seh_stackalloc 32
-; X64: leaq 48(%rdx), %rbp
+; X64: leaq [[STCK_ALLOC]](%rdx), %rbp
; X64: .seh_endprologue
; X64-DAG: leaq -[[local_offs]](%rbp), %rdx
; X64-DAG: movl $3, %ecx
@@ -180,7 +182,7 @@ try.cont:
; X64-NEXT: .long ($tryMap$try_catch_catch)@IMGREL
; X64-NEXT: .long 5
; X64-NEXT: .long ($ip2state$try_catch_catch)@IMGREL
-; X64-NEXT: .long 40
+; X64-NEXT: .long 48
; X64-NEXT: .long 0
; X64-NEXT: .long 1
@@ -194,7 +196,7 @@ try.cont:
; X64: $handlerMap$0$try_catch_catch:
; X64-NEXT: .long 0
; X64-NEXT: .long "??_R0H@8"@IMGREL
-; X64-NEXT: .long 36
+; X64-NEXT: .long 60
; X64-NEXT: .long "?catch$[[catch1bb]]@?0?try_catch_catch@4HA"@IMGREL
; X64-NEXT: .long 56
; X64-NEXT: .long 64
@@ -255,8 +257,8 @@ try.cont:
; X86: pushl %ebp
; X86: subl $8, %esp
; X86: addl $12, %ebp
-; X86: LBB1_[[loopbb:[0-9]+]]: # %loop
; X86: movl $1, -16(%ebp)
+; X86: LBB1_[[loopbb:[0-9]+]]: # %loop
; X86: calll _getbool
; X86: testb $1, %al
; X86: jne LBB1_[[loopbb]]
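The win-catchpad.ll hunks above replace the hard-coded 48-byte frame size with a FileCheck pattern variable so the checks survive the layout change introduced by -stack-symbol-ordering=0. For reference, [[NAME:regex]] captures whatever the regex matches on its first use, and a bare [[NAME]] later requires exactly the same text; a minimal sketch of the idiom, with illustrative instructions:

; CHECK: subq $[[SIZE:[0-9]+]], %rsp
; CHECK: addq $[[SIZE]], %rsp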
diff --git a/test/CodeGen/X86/win-cleanuppad.ll b/test/CodeGen/X86/win-cleanuppad.ll
index 4b0a543a876a..4b3af8c063bf 100644
--- a/test/CodeGen/X86/win-cleanuppad.ll
+++ b/test/CodeGen/X86/win-cleanuppad.ll
@@ -88,11 +88,11 @@ cleanup.outer: ; preds = %invoke.cont.1, %c
}
; X86-LABEL: _nested_cleanup:
-; X86: movl $1, (%esp)
+; X86: pushl $1
; X86: calll _f
-; X86: movl $2, (%esp)
+; X86: pushl $2
; X86: calll _f
-; X86: movl $3, (%esp)
+; X86: pushl $3
; X86: calll _f
; X86: "?dtor$[[cleanup_inner:[0-9]+]]@?0?nested_cleanup@4HA":
@@ -163,7 +163,7 @@ cleanup.outer: ; preds = %invoke.cont.1, %c
; X64: retq
; X64: .section .xdata,"dr"
-; X64-NEXT: .align 4
+; X64-NEXT: .p2align 2
; X64: $cppxdata$nested_cleanup:
; X64-NEXT: .long 429065506
; X64-NEXT: .long 2
diff --git a/test/CodeGen/X86/win32-eh-states.ll b/test/CodeGen/X86/win32-eh-states.ll
index 2777d6644e6a..634653dc2f97 100644
--- a/test/CodeGen/X86/win32-eh-states.ll
+++ b/test/CodeGen/X86/win32-eh-states.ll
@@ -68,19 +68,19 @@ catch.7:
; X86: movl $___ehhandler$f, {{.*}}
;
; X86: movl $0, [[state]](%ebp)
-; X86: movl $1, (%esp)
+; X86: pushl $1
; X86: calll _may_throw
;
; X86: movl $1, [[state]](%ebp)
-; X86: movl $2, (%esp)
+; X86: pushl $2
; X86: calll _may_throw
;
; X86: movl $2, [[state]](%ebp)
-; X86: movl $3, (%esp)
+; X86: pushl $3
; X86: calll _may_throw
;
; X86: movl $3, [[state]](%ebp)
-; X86: movl $4, (%esp)
+; X86: pushl $4
; X86: calll _may_throw
@@ -172,19 +172,19 @@ unreachable: ; preds = %entry
; X86: movl $___ehhandler$g, {{.*}}
;
; X86: movl $1, [[state]](%ebp)
-; X86: movl $-1, (%esp)
+; X86: pushl $-1
; X86: calll _may_throw
;
; X86: movl $2, [[state]](%ebp)
-; X86: movl $0, (%esp)
+; X86: pushl $0
; X86: calll _may_throw
;
; X86: movl $3, [[state]](%ebp)
-; X86: movl $1, (%esp)
+; X86: pushl $1
; X86: calll _may_throw
;
; X86: movl $2, [[state]](%ebp)
-; X86: movl $2, (%esp)
+; X86: pushl $2
; X86: calll _may_throw
; X64-LABEL: g:
diff --git a/test/CodeGen/X86/win32-eh.ll b/test/CodeGen/X86/win32-eh.ll
index 73c7b486a55a..88403c687403 100644
--- a/test/CodeGen/X86/win32-eh.ll
+++ b/test/CodeGen/X86/win32-eh.ll
@@ -88,12 +88,58 @@ catch:
; CHECK-LABEL: L__ehtable$use_except_handler4:
; CHECK-NEXT: .long -2
; CHECK-NEXT: .long 0
-; CHECK-NEXT: .long 9999
+; CHECK-NEXT: .long -40
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long -2
; CHECK-NEXT: .long _catchall_filt
; CHECK-NEXT: .long LBB2_2
+define void @use_except_handler4_ssp() sspstrong personality i32 (...)* @_except_handler4 {
+entry:
+ invoke void @may_throw_or_crash()
+ to label %cont unwind label %lpad
+cont:
+ ret void
+lpad:
+ %cs = catchswitch within none [label %catch] unwind to caller
+catch:
+ %p = catchpad within %cs [i8* bitcast (i32 ()* @catchall_filt to i8*)]
+ catchret from %p to label %cont
+}
+
+; CHECK-LABEL: _use_except_handler4_ssp:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK: subl ${{[0-9]+}}, %esp
+; CHECK: movl %ebp, %[[ehguard:[^ ,]*]]
+; CHECK: movl %esp, -36(%ebp)
+; CHECK: movl $-2, -16(%ebp)
+; CHECK: movl $L__ehtable$use_except_handler4_ssp, %[[lsda:[^ ,]*]]
+; CHECK: xorl ___security_cookie, %[[lsda]]
+; CHECK: movl %[[lsda]], -20(%ebp)
+; CHECK: xorl ___security_cookie, %[[ehguard]]
+; CHECK: movl %[[ehguard]], -40(%ebp)
+; CHECK: leal -28(%ebp), %[[node:[^ ,]*]]
+; CHECK: movl $__except_handler4, -24(%ebp)
+; CHECK: movl %fs:0, %[[next:[^ ,]*]]
+; CHECK: movl %[[next]], -28(%ebp)
+; CHECK: movl %[[node]], %fs:0
+; CHECK: calll _may_throw_or_crash
+; CHECK: movl -28(%ebp), %[[next:[^ ,]*]]
+; CHECK: movl %[[next]], %fs:0
+; CHECK: retl
+; CHECK: [[catch:[^ ,]*]]: # %catch{{$}}
+
+; CHECK: .section .xdata,"dr"
+; CHECK-LABEL: L__ehtable$use_except_handler4_ssp:
+; CHECK-NEXT: .long -2
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long -40
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long -2
+; CHECK-NEXT: .long _catchall_filt
+; CHECK-NEXT: .long [[catch]]
+
define void @use_CxxFrameHandler3() personality i32 (...)* @__CxxFrameHandler3 {
invoke void @may_throw_or_crash()
to label %cont unwind label %catchall
@@ -125,7 +171,7 @@ catch:
; CHECK: retl
; CHECK: .section .xdata,"dr"
-; CHECK: .align 4
+; CHECK: .p2align 2
; CHECK-LABEL: L__ehtable$use_CxxFrameHandler3:
; CHECK-NEXT: .long 429065506
; CHECK-NEXT: .long 2
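The 9999 placeholder that the first win32-eh.ll hunk replaces with -40 is the EH cookie offset field of the _except_handler4 scope table: the backend now records where the security-cookie-XORed EH guard lives, and the new use_except_handler4_ssp test indeed stores that guard at -40(%ebp). An annotated reading of the emitted table (the field meanings are my interpretation of the _except_handler4 layout, not text from the import):

; .long -2              ; GS cookie offset (-2 means none)
; .long 0               ; GS cookie XOR offset
; .long -40             ; EH cookie offset from %ebp (previously the 9999 placeholder)
; .long 0               ; EH cookie XOR offset
; .long -2              ; scope record 0: enclosing try level
; .long _catchall_filt  ; scope record 0: filter function
; .long LBB2_2          ; scope record 0: handler address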
diff --git a/test/CodeGen/X86/win32-seh-catchpad-realign.ll b/test/CodeGen/X86/win32-seh-catchpad-realign.ll
index 23aeea37c117..1ba0c1a0efe1 100644
--- a/test/CodeGen/X86/win32-seh-catchpad-realign.ll
+++ b/test/CodeGen/X86/win32-seh-catchpad-realign.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -stack-symbol-ordering=0 < %s | FileCheck %s
; The aligned alloca means that we have to realign the stack, which forces the
; use of ESI to address local variables.
diff --git a/test/CodeGen/X86/win32-seh-catchpad.ll b/test/CodeGen/X86/win32-seh-catchpad.ll
index 224e96f8b8f0..995a4b9eaf2e 100644
--- a/test/CodeGen/X86/win32-seh-catchpad.ll
+++ b/test/CodeGen/X86/win32-seh-catchpad.ll
@@ -32,16 +32,16 @@ invoke.cont: ; preds = %entry
; CHECK-LABEL: _try_except:
; Store state #0
; CHECK: movl $0, -[[state:[0-9]+]](%ebp)
-; CHECK: movl $1, (%esp)
+; CHECK: pushl $1
; CHECK: calll _f
; CHECK: movl $-1, -[[state]](%ebp)
-; CHECK: movl $3, (%esp)
+; CHECK: pushl $3
; CHECK: calll _f
; CHECK: retl
; __except
; CHECK: movl $-1, -[[state]](%ebp)
-; CHECK: movl $2, (%esp)
+; CHECK: pushl $2
; CHECK: calll _f
; CHECK: .section .xdata,"dr"
@@ -205,7 +205,7 @@ __except:
; CHECK-NEXT: movl -24(%ebp), %esp
; CHECK-NEXT: addl $12, %ebp
; CHECK-NEXT: movl $-1, -16(%ebp)
-; CHECK-NEXT: movl $2, (%esp)
+; CHECK-NEXT: pushl $2
; CHECK-NEXT: calll _f
diff --git a/test/CodeGen/X86/win32-seh-nested-finally.ll b/test/CodeGen/X86/win32-seh-nested-finally.ll
index c283a35d70cf..b732815b8475 100644
--- a/test/CodeGen/X86/win32-seh-nested-finally.ll
+++ b/test/CodeGen/X86/win32-seh-nested-finally.ll
@@ -43,31 +43,35 @@ attributes #3 = { noinline }
; CHECK: movl $-1, -[[state:[0-9]+]](%ebp)
; CHECK: movl {{.*}}, %fs:0
; CHECK: movl $1, -[[state]](%ebp)
-; CHECK: movl $1, (%esp)
+; CHECK: pushl $1
; CHECK: calll _f
+; CHECK: addl $4, %esp
; CHECK: movl $0, -[[state]](%ebp)
-; CHECK: movl $2, (%esp)
+; CHECK: pushl $2
; CHECK: calll _f
+; CHECK: addl $4, %esp
; CHECK: movl $-1, -[[state]](%ebp)
-; CHECK: movl $3, (%esp)
+; CHECK: pushl $3
; CHECK: calll _f
+; CHECK: addl $4, %esp
; CHECK: retl
; CHECK: LBB0_[[inner:[0-9]+]]: # %ehcleanup
; CHECK: pushl %ebp
; CHECK: addl $12, %ebp
-; CHECK: movl $0, -[[state]](%ebp)
-; CHECK: movl $2, (%esp)
+; CHECK: pushl $2
; CHECK: calll _f
+; CHECK: addl $4, %esp
+; CHECK: addl $4, %esp
; CHECK: popl %ebp
; CHECK: retl
; CHECK: LBB0_[[outer:[0-9]+]]: # %ehcleanup.3
; CHECK: pushl %ebp
; CHECK: addl $12, %ebp
-; CHECK: movl $-1, -[[state]](%ebp)
-; CHECK: movl $3, (%esp)
+; CHECK: pushl $3
; CHECK: calll _f
+; CHECK: addl $8, %esp
; CHECK: popl %ebp
; CHECK: retl
diff --git a/test/CodeGen/X86/win32_sret.ll b/test/CodeGen/X86/win32_sret.ll
index b38273ad9594..56008e15910e 100644
--- a/test/CodeGen/X86/win32_sret.ll
+++ b/test/CodeGen/X86/win32_sret.ll
@@ -135,12 +135,11 @@ entry:
; Load the address of the result and put it onto the stack
-; (through %ecx in the -O0 build).
-; WIN32: leal {{[0-9]+}}(%esp), %e{{[a-d]}}x
-; WIN32: movl %e{{[a-d]}}x, (%e{{([a-d]x)|(sp)}})
-
; The this pointer goes to ECX.
-; WIN32-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; (through %ecx in the -O0 build).
+; WIN32: leal {{[0-9]*}}(%esp), %e{{[a-d]}}x
+; WIN32: leal {{[0-9]*}}(%esp), %ecx
+; WIN32: {{pushl %e[a-d]x|movl %e[a-d]x, \(%esp\)}}
; WIN32-NEXT: calll "?foo@C5@@QAE?AUS5@@XZ"
; WIN32: retl
ret void
@@ -155,25 +154,21 @@ define void @test6_f(%struct.test6* %x) nounwind {
; LINUX-LABEL: test6_f:
; The %x argument is moved to %ecx. It will be the this pointer.
-; WIN32: movl 20(%esp), %ecx
-
-; The %x argument is moved to (%esp). It will be the this pointer. With -O0
-; we copy esp to ecx and use (ecx) instead of (esp).
-; MINGW_X86: movl 20(%esp), %eax
-; MINGW_X86: movl %eax, (%e{{([a-d]x)|(sp)}})
+; WIN32: movl {{16|20}}(%esp), %ecx
-; CYGWIN: movl 20(%esp), %eax
-; CYGWIN: movl %eax, (%e{{([a-d]x)|(sp)}})
; The sret pointer is (%esp)
-; WIN32: leal 4(%esp), %[[REG:e[a-d]x]]
-; WIN32-NEXT: movl %[[REG]], (%e{{([a-d]x)|(sp)}})
+; WIN32: leal {{4?}}(%esp), %eax
+; WIN32-NEXT: {{pushl %eax|movl %eax, \(%esp\)}}
; The sret pointer is %ecx
-; MINGW_X86-NEXT: leal 4(%esp), %ecx
+; The %x argument is moved to (%esp). It will be the this pointer.
+; MINGW_X86: leal {{4?}}(%esp), %ecx
+; MINGW_X86-NEXT: {{pushl 16\(%esp\)|movl %eax, \(%esp\)}}
; MINGW_X86-NEXT: calll _test6_g
-; CYGWIN-NEXT: leal 4(%esp), %ecx
+; CYGWIN: leal {{4?}}(%esp), %ecx
+; CYGWIN-NEXT: {{pushl 16\(%esp\)|movl %eax, \(%esp\)}}
; CYGWIN-NEXT: calll _test6_g
%tmp = alloca %struct.test6, align 4
@@ -191,17 +186,17 @@ define void @test7_f(%struct.test7* %x) nounwind {
; LINUX-LABEL: test7_f:
; The %x argument is moved to %ecx on all OSs. It will be the this pointer.
-; WIN32: movl 20(%esp), %ecx
-; MINGW_X86: movl 20(%esp), %ecx
-; CYGWIN: movl 20(%esp), %ecx
+; WIN32: movl {{16|20}}(%esp), %ecx
+; MINGW_X86: movl {{16|20}}(%esp), %ecx
+; CYGWIN: movl {{16|20}}(%esp), %ecx
; The sret pointer is (%esp)
-; WIN32: leal 4(%esp), %[[REG:e[a-d]x]]
-; WIN32-NEXT: movl %[[REG]], (%e{{([a-d]x)|(sp)}})
-; MINGW_X86: leal 4(%esp), %[[REG:e[a-d]x]]
-; MINGW_X86-NEXT: movl %[[REG]], (%e{{([a-d]x)|(sp)}})
-; CYGWIN: leal 4(%esp), %[[REG:e[a-d]x]]
-; CYGWIN-NEXT: movl %[[REG]], (%e{{([a-d]x)|(sp)}})
+; WIN32: leal {{4?}}(%esp), %eax
+; WIN32-NEXT: {{pushl %eax|movl %eax, \(%esp\)}}
+; MINGW_X86: leal {{4?}}(%esp), %eax
+; MINGW_X86-NEXT: {{pushl %eax|movl %eax, \(%esp\)}}
+; CYGWIN: leal {{4?}}(%esp), %eax
+; CYGWIN-NEXT: {{pushl %eax|movl %eax, \(%esp\)}}
%tmp = alloca %struct.test7, align 4
call x86_thiscallcc void @test7_g(%struct.test7* %x, %struct.test7* sret %tmp)
diff --git a/test/CodeGen/X86/win64_eh.ll b/test/CodeGen/X86/win64_eh.ll
index cb9d026bec2d..9421f00c8107 100644
--- a/test/CodeGen/X86/win64_eh.ll
+++ b/test/CodeGen/X86/win64_eh.ll
@@ -47,7 +47,6 @@ entry:
; WIN64: .seh_endproc
-; Checks stack push
define i32 @foo3(i32 %f_arg, i32 %e_arg, i32 %d_arg, i32 %c_arg, i32 %b_arg, i32 %a_arg) uwtable {
entry:
%a = alloca i32
@@ -83,14 +82,11 @@ entry:
}
; WIN64-LABEL: foo3:
; WIN64: .seh_proc foo3
-; WIN64: pushq %rsi
-; WIN64: .seh_pushreg 6
; NORM: subq $24, %rsp
; ATOM: leaq -24(%rsp), %rsp
; WIN64: .seh_stackalloc 24
; WIN64: .seh_endprologue
; WIN64: addq $24, %rsp
-; WIN64: popq %rsi
; WIN64: ret
; WIN64: .seh_endproc
diff --git a/test/CodeGen/X86/win_cst_pool.ll b/test/CodeGen/X86/win_cst_pool.ll
index 77c37b4d348e..a674d8c080af 100644
--- a/test/CodeGen/X86/win_cst_pool.ll
+++ b/test/CodeGen/X86/win_cst_pool.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=sse2 -mattr=avx | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-windows-msvc"
@@ -7,7 +7,7 @@ define double @double() {
}
; CHECK: .globl __real@0000000000800000
; CHECK-NEXT: .section .rdata,"dr",discard,__real@0000000000800000
-; CHECK-NEXT: .align 8
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: __real@0000000000800000:
; CHECK-NEXT: .quad 8388608
; CHECK: double:
@@ -19,7 +19,7 @@ define <4 x i32> @vec1() {
}
; CHECK: .globl __xmm@00000000000000010000000200000003
; CHECK-NEXT: .section .rdata,"dr",discard,__xmm@00000000000000010000000200000003
-; CHECK-NEXT: .align 16
+; CHECK-NEXT: .p2align 4
; CHECK-NEXT: __xmm@00000000000000010000000200000003:
; CHECK-NEXT: .long 3
; CHECK-NEXT: .long 2
@@ -34,7 +34,7 @@ define <8 x i16> @vec2() {
}
; CHECK: .globl __xmm@00000001000200030004000500060007
; CHECK-NEXT: .section .rdata,"dr",discard,__xmm@00000001000200030004000500060007
-; CHECK-NEXT: .align 16
+; CHECK-NEXT: .p2align 4
; CHECK-NEXT: __xmm@00000001000200030004000500060007:
; CHECK-NEXT: .short 7
; CHECK-NEXT: .short 6
@@ -54,7 +54,7 @@ define <4 x float> @undef1() {
; CHECK: .globl __xmm@00000000000000003f8000003f800000
; CHECK-NEXT: .section .rdata,"dr",discard,__xmm@00000000000000003f8000003f800000
-; CHECK-NEXT: .align 16
+; CHECK-NEXT: .p2align 4
; CHECK-NEXT: __xmm@00000000000000003f8000003f800000:
; CHECK-NEXT: .long 1065353216 # float 1
; CHECK-NEXT: .long 1065353216 # float 1
@@ -73,7 +73,21 @@ define float @pr23966(i32 %a) {
; CHECK: .globl __real@bf8000003f800000
; CHECK-NEXT: .section .rdata,"dr",discard,__real@bf8000003f800000
-; CHECK-NEXT: .align 4
+; CHECK-NEXT: .p2align 3
; CHECK-NEXT: __real@bf8000003f800000:
; CHECK-NEXT: .long 1065353216
; CHECK-NEXT: .long 3212836864
+
+define <4 x i64> @ymm() {
+entry:
+ ret <4 x i64> <i64 8589934593, i64 17179869187, i64 8589934593, i64 17179869187>
+}
+
+; CHECK: .globl __ymm@0000000400000003000000020000000100000004000000030000000200000001
+; CHECK: .section .rdata,"dr",discard,__ymm@0000000400000003000000020000000100000004000000030000000200000001
+; CHECK: .p2align 5
+; CHECK: __ymm@0000000400000003000000020000000100000004000000030000000200000001:
+; CHECK: .quad 8589934593 # 0x200000001
+; CHECK: .quad 17179869187 # 0x400000003
+; CHECK: .quad 8589934593 # 0x200000001
+; CHECK: .quad 17179869187
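A reading aid for the .align to .p2align churn in this file (and in the win-cleanuppad and win32-eh hunks earlier): .p2align n requests 2^n-byte alignment, so most of these rewrites keep the alignment and only change the spelling; the 8-byte __real@bf8000003f800000 constant above is the exception, moving from 4- to 8-byte alignment, and the new 256-bit __ymm constant gets the 32-byte case:

  .align 4   ->  .p2align 2   (2^2 = 4 bytes)
  .align 8   ->  .p2align 3   (2^3 = 8 bytes)
  .align 16  ->  .p2align 4   (2^4 = 16 bytes)
                 .p2align 5   (2^5 = 32 bytes, new __ymm entry)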
diff --git a/test/CodeGen/X86/x86-16.ll b/test/CodeGen/X86/x86-16.ll
new file mode 100644
index 000000000000..775b2c447bbd
--- /dev/null
+++ b/test/CodeGen/X86/x86-16.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-code16"
+
+; Function Attrs: nounwind
+define i32 @main() #0 {
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval, align 4
+ ret i32 0
+}
+
+; CHECK: .code16
+; CHECK-LABEL: main
+
+
+attributes #0 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.9.0 (trunk 265439) (llvm/trunk 265567)"} \ No newline at end of file
diff --git a/test/CodeGen/X86/x86-32-intrcc.ll b/test/CodeGen/X86/x86-32-intrcc.ll
index 99d0044c6de6..9794f2cb3e46 100644
--- a/test/CodeGen/X86/x86-32-intrcc.ll
+++ b/test/CodeGen/X86/x86-32-intrcc.ll
@@ -3,7 +3,7 @@
%struct.interrupt_frame = type { i32, i32, i32, i32, i32 }
-@llvm.used = appending global [3 x i8*] [i8* bitcast (void (%struct.interrupt_frame*)* @test_isr_no_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i32)* @test_isr_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i32)* @test_isr_clobbers to i8*)], section "llvm.metadata"
+@llvm.used = appending global [4 x i8*] [i8* bitcast (void (%struct.interrupt_frame*)* @test_isr_no_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i32)* @test_isr_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i32)* @test_isr_clobbers to i8*), i8* bitcast (void (%struct.interrupt_frame*)* @test_isr_x87 to i8*)], section "llvm.metadata"
; Spills eax, putting original esp at +4.
; No stack adjustment if declared with no error code
@@ -77,3 +77,19 @@ define x86_intrcc void @test_isr_clobbers(%struct.interrupt_frame* %frame, i32 %
ret void
}
+@f80 = common global x86_fp80 0xK00000000000000000000, align 4
+
+; Test that the presence of x87 does not crash the FP stackifier
+define x86_intrcc void @test_isr_x87(%struct.interrupt_frame* %frame) {
+ ; CHECK-LABEL: test_isr_x87
+ ; CHECK-DAG: fldt f80
+ ; CHECK-DAG: fld1
+ ; CHECK: faddp
+ ; CHECK-NEXT: fstpt f80
+ ; CHECK-NEXT: iretl
+entry:
+ %ld = load x86_fp80, x86_fp80* @f80, align 4
+ %add = fadd x86_fp80 %ld, 0xK3FFF8000000000000000
+ store x86_fp80 %add, x86_fp80* @f80, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/x86-32-vector-calling-conv.ll b/test/CodeGen/X86/x86-32-vector-calling-conv.ll
index b2bda7ab8d01..e87f2b065d3a 100644
--- a/test/CodeGen/X86/x86-32-vector-calling-conv.ll
+++ b/test/CodeGen/X86/x86-32-vector-calling-conv.ll
@@ -2,11 +2,11 @@
; RUN: llc < %s -mtriple=i686-pc-linux -mattr=+avx512f | FileCheck %s --check-prefix=LINUX
; CHECK-LABEL: test_sse:
-; DARWIN-DAG: vpaddd %xmm1, %xmm0, %xmm0
-; DARWIN-DAG: vpaddd %xmm3, %xmm2, %xmm1
+; DARWIN: vpaddd %xmm3, %xmm2, %xmm2
+; DARWIN: vpaddd %xmm2, %xmm1, %xmm1
; DARWIN: vpaddd %xmm1, %xmm0, %xmm0
-; LINUX-DAG: vpaddd %xmm1, %xmm0, %xmm0
-; LINUX-DAG: vpaddd {{[0-9]+}}(%e{{s|b}}p), %xmm2, %xmm1
+; LINUX: vpaddd {{[0-9]+}}(%e{{s|b}}p), %xmm2, %xmm2
+; LINUX: vpaddd %xmm2, %xmm1, %xmm1
; LINUX: vpaddd %xmm1, %xmm0, %xmm0
define <4 x i32> @test_sse(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) nounwind {
%r0 = add <4 x i32> %a, %b
@@ -16,11 +16,11 @@ define <4 x i32> @test_sse(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %
}
; CHECK-LABEL: test_avx:
-; DARWIN-DAG: vpaddd %ymm1, %ymm0, %ymm0
-; DARWIN-DAG: vpaddd %ymm3, %ymm2, %ymm1
+; DARWIN: vpaddd %ymm3, %ymm2, %ymm2
+; DARWIN: vpaddd %ymm2, %ymm1, %ymm1
; DARWIN: vpaddd %ymm1, %ymm0, %ymm0
-; LINUX-DAG: vpaddd %ymm1, %ymm0, %ymm0
-; LINUX-DAG: vpaddd {{[0-9]+}}(%e{{s|b}}p), %ymm2, %ymm1
+; LINUX: vpaddd {{[0-9]+}}(%e{{s|b}}p), %ymm2, %ymm2
+; LINUX: vpaddd %ymm2, %ymm1, %ymm1
; LINUX: vpaddd %ymm1, %ymm0, %ymm0
define <8 x i32> @test_avx(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) nounwind {
%r0 = add <8 x i32> %a, %b
@@ -30,11 +30,11 @@ define <8 x i32> @test_avx(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %
}
; CHECK-LABEL: test_avx512:
-; DARWIN-DAG: vpaddd %zmm1, %zmm0, %zmm0
-; DARWIN-DAG: vpaddd %zmm3, %zmm2, %zmm1
+; DARWIN: vpaddd %zmm3, %zmm2, %zmm2
+; DARWIN: vpaddd %zmm2, %zmm1, %zmm1
; DARWIN: vpaddd %zmm1, %zmm0, %zmm0
-; LINUX-DAG: vpaddd %zmm1, %zmm0, %zmm0
-; LINUX-DAG: vpaddd {{[0-9]+}}(%e{{s|b}}p), %zmm2, %zmm1
+; LINUX: vpaddd {{[0-9]+}}(%e{{s|b}}p), %zmm2, %zmm2
+; LINUX: vpaddd %zmm2, %zmm1, %zmm1
; LINUX: vpaddd %zmm1, %zmm0, %zmm0
define <16 x i32> @test_avx512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i32> %d) nounwind {
%r0 = add <16 x i32> %a, %b
diff --git a/test/CodeGen/X86/x86-64-flags-intrinsics.ll b/test/CodeGen/X86/x86-64-flags-intrinsics.ll
index 4c5032aedbca..2852ef49e0a5 100644
--- a/test/CodeGen/X86/x86-64-flags-intrinsics.ll
+++ b/test/CodeGen/X86/x86-64-flags-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
target triple = "x86_64-pc-win32"
declare i64 @llvm.x86.flags.read.u64()
diff --git a/test/CodeGen/X86/x86-64-intrcc.ll b/test/CodeGen/X86/x86-64-intrcc.ll
index 429209c063ca..2bcf3cde478a 100644
--- a/test/CodeGen/X86/x86-64-intrcc.ll
+++ b/test/CodeGen/X86/x86-64-intrcc.ll
@@ -3,7 +3,7 @@
%struct.interrupt_frame = type { i64, i64, i64, i64, i64 }
-@llvm.used = appending global [3 x i8*] [i8* bitcast (void (%struct.interrupt_frame*)* @test_isr_no_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i64)* @test_isr_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i64)* @test_isr_clobbers to i8*)], section "llvm.metadata"
+@llvm.used = appending global [4 x i8*] [i8* bitcast (void (%struct.interrupt_frame*)* @test_isr_no_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i64)* @test_isr_ecode to i8*), i8* bitcast (void (%struct.interrupt_frame*, i64)* @test_isr_clobbers to i8*), i8* bitcast (void (%struct.interrupt_frame*)* @test_isr_x87 to i8*)], section "llvm.metadata"
; Spills rax, putting original esp at +8.
; No stack adjustment if declared with no error code
@@ -83,4 +83,21 @@ define x86_intrcc void @test_isr_clobbers(%struct.interrupt_frame* %frame, i64 %
; CHECK0-SSE-NEXT: addq $8, %rsp
; CHECK0-SSE-NEXT: iretq
ret void
-} \ No newline at end of file
+}
+
+@f80 = common global x86_fp80 0xK00000000000000000000, align 4
+
+; Test that the presence of x87 does not crash the FP stackifier
+define x86_intrcc void @test_isr_x87(%struct.interrupt_frame* %frame) {
+ ; CHECK-LABEL: test_isr_x87
+ ; CHECK-DAG: fldt f80
+ ; CHECK-DAG: fld1
+ ; CHECK: faddp
+ ; CHECK-NEXT: fstpt f80
+ ; CHECK-NEXT: iretq
+entry:
+ %ld = load x86_fp80, x86_fp80* @f80, align 4
+ %add = fadd x86_fp80 %ld, 0xK3FFF8000000000000000
+ store x86_fp80 %add, x86_fp80* @f80, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/x86-64-pic.ll b/test/CodeGen/X86/x86-64-pic.ll
new file mode 100644
index 000000000000..76ed8894b417
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-pic.ll
@@ -0,0 +1,8 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic | FileCheck %s
+
+@g1 = private global i8 1
+define i8* @get_g1() {
+; CHECK: get_g1:
+; CHECK: leaq .Lg1(%rip), %rax
+ ret i8* @g1
+}
diff --git a/test/CodeGen/X86/x86-64-plt-relative-reloc.ll b/test/CodeGen/X86/x86-64-plt-relative-reloc.ll
new file mode 100644
index 000000000000..8ba480d1e1d6
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-plt-relative-reloc.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=x86_64-unknown-linux -o - %s | FileCheck %s
+
+@vtable = constant [5 x i32] [i32 0,
+ i32 trunc (i64 sub (i64 ptrtoint (void ()* @fn1 to i64), i64 ptrtoint (i32* getelementptr ([5 x i32], [5 x i32]* @vtable, i32 0, i32 1) to i64)) to i32),
+ i32 trunc (i64 sub (i64 ptrtoint (void ()* @fn2 to i64), i64 ptrtoint (i32* getelementptr ([5 x i32], [5 x i32]* @vtable, i32 0, i32 1) to i64)) to i32),
+ i32 trunc (i64 sub (i64 ptrtoint (void ()* @fn3 to i64), i64 ptrtoint (i32* getelementptr ([5 x i32], [5 x i32]* @vtable, i32 0, i32 1) to i64)) to i32),
+ i32 trunc (i64 sub (i64 ptrtoint (i8* @global4 to i64), i64 ptrtoint (i32* getelementptr ([5 x i32], [5 x i32]* @vtable, i32 0, i32 1) to i64)) to i32)
+]
+
+declare void @fn1() unnamed_addr
+declare void @fn2() unnamed_addr
+declare void @fn3()
+@global4 = external unnamed_addr global i8
+
+; CHECK: .long 0
+; CHECK-NEXT: .long (fn1@PLT-vtable)-4
+; CHECK-NEXT: .long (fn2@PLT-vtable)-4
+; CHECK-NEXT: .long (fn3-vtable)-4
+; CHECK-NEXT: .long (global4-vtable)-4
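In this test and the 32-bit x86-plt-relative-reloc.ll added later, each vtable slot stores the distance from the second vtable element to its target. That element sits 4 bytes past the vtable label, which is why every expected entry reads (target-vtable)-4; fn1 and fn2 are external and unnamed_addr, so they may be reached through their PLT stubs (fn1@PLT), while fn3 and global4 are referenced directly. A reduced one-entry sketch of the same shape (vt and callee are illustrative names):

; RUN: llc -mtriple=x86_64-unknown-linux -o - %s | FileCheck %s
@vt = constant [2 x i32] [i32 0,
  i32 trunc (i64 sub (i64 ptrtoint (void ()* @callee to i64),
                      i64 ptrtoint (i32* getelementptr ([2 x i32], [2 x i32]* @vt, i32 0, i32 1) to i64)) to i32)]
declare void @callee() unnamed_addr

; CHECK: .long 0
; CHECK-NEXT: .long (callee@PLT-vt)-4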
diff --git a/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll b/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll
index c476ffd84053..b1f4ca562236 100644
--- a/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll
+++ b/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-pc-linux-gnux32 < %s | FileCheck -check-prefix=X32ABI %s
-; RUN: llc -mtriple=x86_64-pc-nacl < %s | FileCheck -check-prefix=NACL %s
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-pc-linux < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-pc-linux-gnux32 < %s | FileCheck -check-prefix=X32ABI %s
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-pc-nacl < %s | FileCheck -check-prefix=NACL %s
; x32 uses %esp, %ebp as stack and frame pointers
diff --git a/test/CodeGen/X86/x86-big-ret.ll b/test/CodeGen/X86/x86-big-ret.ll
new file mode 100644
index 000000000000..b7fed33f396b
--- /dev/null
+++ b/test/CodeGen/X86/x86-big-ret.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i386-pc-windows-msvc"
+
+define x86_fastcallcc i32 @test1(i32 inreg %V, [65533 x i8]* byval %p_arg) {
+ ret i32 %V
+}
+; CHECK-LABEL: @test1@65540:
+; CHECK: movl %ecx, %eax
+; CHECK-NEXT: popl %ecx
+; CHECK-NEXT: addl $65536, %esp
+; CHECK-NEXT: pushl %ecx
+; CHECK-NEXT: retl
+
+define x86_stdcallcc void @test2([65533 x i8]* byval %p_arg) {
+ ret void
+}
+; CHECK-LABEL: _test2@65536:
+; CHECK: popl %ecx
+; CHECK-NEXT: addl $65536, %esp
+; CHECK-NEXT: pushl %ecx
+; CHECK-NEXT: retl
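Two things are worth spelling out about this new test. The callee-cleanup conventions must pop the argument bytes, but retl takes only a 16-bit immediate, so 65536 bytes cannot be popped by the return instruction itself; instead the return address is popped into %ecx, %esp is adjusted manually, and the address is pushed back before a plain retl, which is the sequence the CHECK lines demand. The decorated names simply encode the argument byte counts from the test:

  [65533 x i8] byval  -> padded to 65536 bytes of stack argument space
  i32 inreg (%ecx)    -> +4, still counted in the fastcall decoration
  fastcall:  @test1@65540   (65536 + 4)
  stdcall:   _test2@65536   (byval only)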
diff --git a/test/CodeGen/X86/x86-flags-intrinsics.ll b/test/CodeGen/X86/x86-flags-intrinsics.ll
index 325de7d5f1e7..e2233aec22c7 100644
--- a/test/CodeGen/X86/x86-flags-intrinsics.ll
+++ b/test/CodeGen/X86/x86-flags-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
target triple = "i686-pc-win32"
declare i32 @llvm.x86.flags.read.u32()
diff --git a/test/CodeGen/X86/x86-interrupt_cc.ll b/test/CodeGen/X86/x86-interrupt_cc.ll
new file mode 100644
index 000000000000..b91b8fbfb76d
--- /dev/null
+++ b/test/CodeGen/X86/x86-interrupt_cc.ll
@@ -0,0 +1,33 @@
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-apple-macosx -show-mc-encoding -mattr=+avx512f < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK64
+; RUN: llc -verify-machineinstrs -mtriple=i386-apple-macosx -show-mc-encoding -mattr=+avx512f < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK32
+
+; Make sure we spill the high-numbered ZMM registers and K registers with the right encoding.
+; CHECK-LABEL: foo
+; CHECK: kmovq %k7, {{.+}}
+; CHECK64: encoding: [0xc4,0xe1,0xf8,0x91,0xbc,0x24,0x68,0x08,0x00,0x00]
+; CHECK32: encoding: [0xc4,0xe1,0xf8,0x91,0xbc,0x24,0x68,0x02,0x00,0x00]
+; k6 is used as an anchor for the previous regexp.
+; CHECK-NEXT: kmovq %k6
+
+; CHECK64: movups %zmm31, {{.+}}
+; CHECK64: encoding: [0x62,0x61,0x7c,0x48,0x11,0xbc,0x24,0xe0,0x07,0x00,0x00]
+; zmm30 is used as an anchor for the previous regexp.
+; CHECK64-NEXT: movups %zmm30
+
+; CHECK32-NOT: zmm31
+; CHECK32-NOT: zmm8
+; CHECK32: movups %zmm7, {{.+}}
+; CHECK32: encoding: [0x62,0xf1,0x7c,0x48,0x11,0xbc,0x24,0xe0,0x01,0x00,0x00]
+; zmm6 is used as an anchor for the previous regexp.
+; CHECK32-NEXT: movups %zmm6
+
+; CHECK: call
+; CHECK: iret
+
+define x86_intrcc void @foo(i8* %frame) {
+ call void @bar()
+ ret void
+}
+
+declare void @bar()
+
diff --git a/test/CodeGen/X86/x86-interrupt_cld.ll b/test/CodeGen/X86/x86-interrupt_cld.ll
new file mode 100644
index 000000000000..bbb109eb633e
--- /dev/null
+++ b/test/CodeGen/X86/x86-interrupt_cld.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Checks that interrupt handler code calls cld before calling an external
+;; function.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; CHECK: cld
+; CHECK: call
+
+define x86_intrcc void @foo(i8* %frame) {
+ call void @bar()
+ ret void
+}
+
+declare void @bar()
+
diff --git a/test/CodeGen/X86/x86-interrupt_vzeroupper.ll b/test/CodeGen/X86/x86-interrupt_vzeroupper.ll
new file mode 100644
index 000000000000..b735ae82bd52
--- /dev/null
+++ b/test/CodeGen/X86/x86-interrupt_vzeroupper.ll
@@ -0,0 +1,19 @@
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Checks that interrupt handler code does not issue a "vzeroupper" instruction
+;; before iret.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; CHECK: vzeroupper
+; CHECK-NEXT: call
+; CHECK-NOT: vzeroupper
+; CHECK: iret
+
+define x86_intrcc void @foo(i8* %frame) {
+ call void @bar()
+ ret void
+}
+
+declare void @bar()
+
diff --git a/test/CodeGen/X86/x86-plt-relative-reloc.ll b/test/CodeGen/X86/x86-plt-relative-reloc.ll
new file mode 100644
index 000000000000..733a4cb5f034
--- /dev/null
+++ b/test/CodeGen/X86/x86-plt-relative-reloc.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple=i686-unknown-linux -o - %s | FileCheck %s
+
+@vtable = constant [4 x i32] [i32 0,
+ i32 sub (i32 ptrtoint (void ()* @fn1 to i32), i32 ptrtoint (i32* getelementptr ([4 x i32], [4 x i32]* @vtable, i32 0, i32 1) to i32)),
+ i32 sub (i32 ptrtoint (void ()* @fn2 to i32), i32 ptrtoint (i32* getelementptr ([4 x i32], [4 x i32]* @vtable, i32 0, i32 1) to i32)),
+ i32 sub (i32 ptrtoint (void ()* @fn3 to i32), i32 ptrtoint (i32* getelementptr ([4 x i32], [4 x i32]* @vtable, i32 0, i32 1) to i32))
+]
+
+declare void @fn1() unnamed_addr
+declare void @fn2() unnamed_addr
+declare void @fn3()
+
+; CHECK: .long 0
+; CHECK-NEXT: .long (fn1@PLT-vtable)-4
+; CHECK-NEXT: .long (fn2@PLT-vtable)-4
+; CHECK-NEXT: .long (fn3-vtable)-4
diff --git a/test/CodeGen/X86/x86-shrink-wrap-unwind.ll b/test/CodeGen/X86/x86-shrink-wrap-unwind.ll
index eb87f7101d7c..2899e38b71cd 100644
--- a/test/CodeGen/X86/x86-shrink-wrap-unwind.ll
+++ b/test/CodeGen/X86/x86-shrink-wrap-unwind.ll
@@ -1,4 +1,4 @@
-; RUN: llc %s -o - | FileCheck %s --check-prefix=CHECK
+; RUN: llc %s -o - | FileCheck %s
;
; Note: This test cannot be merged with the shrink-wrapping tests
; because the booleans set on the command line take precedence on
@@ -185,7 +185,7 @@ attributes #2 = { "no-frame-pointer-elim"="false" nounwind }
; CHECK-NEXT: je [[STRINGS_EQUAL]]
;
; CHECK: [[STRINGS_EQUAL]]
-; CHECK-NEXT: popq
+; CHECK: popq
define zeroext i1 @segmentedStack(i8* readonly %vk1, i8* readonly %vk2, i64 %key_size) #5 {
entry:
%cmp.i = icmp eq i8* %vk1, null
diff --git a/test/CodeGen/X86/x86-shrink-wrapping.ll b/test/CodeGen/X86/x86-shrink-wrapping.ll
index 609e2cc1158c..5b6e773fe5d4 100644
--- a/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -811,8 +811,6 @@ end:
;
; Load the value of b.
; CHECK: movb _b(%rip), [[BOOL:%cl]]
-; Extract i1 from the loaded value.
-; CHECK-NEXT: andb $1, [[BOOL]]
; Create the zero value for the select assignment.
; CHECK-NEXT: xorl [[CMOVE_VAL:%eax]], [[CMOVE_VAL]]
; CHECK-NEXT: testb [[BOOL]], [[BOOL]]
diff --git a/test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll b/test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll
index d885f1cd364f..d3a12862a9e4 100644
--- a/test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll
+++ b/test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll
@@ -1,41 +1,44 @@
-; RUN: llc -mattr=+avx < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 -mattr=+avx | FileCheck %s
; Check that we properly upgrade the AVX vbroadcast intrinsics to IR. The
; expectation is that we should still get the original instruction back that
; maps to the intrinsic.
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.9.0"
-; CHECK-LABEL: test_mm_broadcast_ss:
define <4 x float> @test_mm_broadcast_ss(float* readonly %__a){
+; CHECK-LABEL: test_mm_broadcast_ss:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
+; CHECK-NEXT: retq
entry:
%0 = bitcast float* %__a to i8*
-; CHECK: vbroadcastss (%{{.*}}), %xmm
%1 = tail call <4 x float> @llvm.x86.avx.vbroadcast.ss(i8* %0)
ret <4 x float> %1
}
+declare <8 x float> @llvm.x86.avx.vbroadcast.ss.256(i8*)
-; CHECK-LABEL: test_mm256_broadcast_sd:
define <4 x double> @test_mm256_broadcast_sd(double* readonly %__a) {
+; CHECK-LABEL: test_mm256_broadcast_sd:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
+; CHECK-NEXT: retq
entry:
%0 = bitcast double* %__a to i8*
-; CHECK: vbroadcastsd (%{{.*}}), %ymm
%1 = tail call <4 x double> @llvm.x86.avx.vbroadcast.sd.256(i8* %0)
ret <4 x double> %1
}
+declare <4 x double> @llvm.x86.avx.vbroadcast.sd.256(i8*)
-; CHECK-LABEL: test_mm256_broadcast_ss:
define <8 x float> @test_mm256_broadcast_ss(float* readonly %__a) {
+; CHECK-LABEL: test_mm256_broadcast_ss:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
+; CHECK-NEXT: retq
entry:
%0 = bitcast float* %__a to i8*
-; CHECK: vbroadcastss (%{{.*}}), %ymm
%1 = tail call <8 x float> @llvm.x86.avx.vbroadcast.ss.256(i8* %0)
ret <8 x float> %1
}
-
-declare <8 x float> @llvm.x86.avx.vbroadcast.ss.256(i8*)
-
-declare <4 x double> @llvm.x86.avx.vbroadcast.sd.256(i8*)
-
declare <4 x float> @llvm.x86.avx.vbroadcast.ss(i8*)
diff --git a/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll b/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll
index d4813ea47a3d..8e081b9e4100 100644
--- a/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll
@@ -1,13 +1,17 @@
-; RUN: llc -mattr=+avx2 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.10.0 -mattr=+avx2 | FileCheck %s
; Check that we properly upgrade the AVX2 vbroadcast intrinsic to IR.
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.10.0"
define <4 x i64> @broadcast128(<2 x i64> %src) {
- ; CHECK-LABEL: broadcast128
- ; CHECK: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-LABEL: broadcast128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
%1 = alloca <2 x i64>, align 16
%2 = bitcast <2 x i64>* %1 to i8*
store <2 x i64> %src, <2 x i64>* %1, align 16
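Both vbroadcast-upgrade tests now carry the autogenerated-assertions NOTE: their CHECK bodies were produced by the update script rather than written by hand, and the RUN lines now pin the full triple that the generated checks correspond to. The usual regeneration step, stated as an assumption about the workflow (exact options vary by release, and llc is assumed to be on PATH) rather than as part of this import:

  $ utils/update_llc_test_checks.py test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll \
        test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll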
diff --git a/test/CodeGen/X86/x87.ll b/test/CodeGen/X86/x87.ll
new file mode 100644
index 000000000000..683d7b05cf8c
--- /dev/null
+++ b/test/CodeGen/X86/x87.ll
@@ -0,0 +1,55 @@
+; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X87
+; RUN: llc < %s -march=x86-64 -mattr=-sse | FileCheck %s -check-prefix=X87
+; RUN: llc < %s -march=x86 -mattr=-x87 | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
+; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
+; RUN: llc < %s -march=x86 -mattr=-x87,+sse | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
+; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse2 | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
+
+define void @test(i32 %i, i64 %l, float* %pf, double* %pd, fp128* %pld) nounwind readnone {
+; X87-LABEL: test:
+; NOX87-LABEL: test:
+; X87: fild
+; NOX87: __floatunsisf
+ %tmp = uitofp i32 %i to float
+
+; X87: fild
+; NOX87: __floatdisf
+ %tmp1 = sitofp i64 %l to float
+
+; X87: fadd
+; NOX87: __addsf3
+ %tmp2 = fadd float %tmp, %tmp1
+
+; X87: fstp
+ store float %tmp2, float* %pf
+
+; X87: fild
+; NOX87: __floatunsidf
+ %tmp3 = uitofp i32 %i to double
+
+; X87: fild
+; NOX87: __floatdidf
+ %tmp4 = sitofp i64 %l to double
+
+; X87: fadd
+; NOX87: __adddf3
+ %tmp5 = fadd double %tmp3, %tmp4
+
+; X87: fstp
+ store double %tmp5, double* %pd
+
+; X87: __floatsitf
+; NOX87: __floatsitf
+ %tmp6 = sitofp i32 %i to fp128
+
+; X87: __floatunditf
+; NOX87: __floatunditf
+ %tmp7 = uitofp i64 %l to fp128
+
+; X87: __addtf3
+; NOX87: __addtf3
+ %tmp8 = fadd fp128 %tmp6, %tmp7
+ store fp128 %tmp8, fp128* %pld
+
+ ret void
+}
diff --git a/test/CodeGen/X86/xaluo.ll b/test/CodeGen/X86/xaluo.ll
index 7c4b60d264c9..eb0fd8649868 100644
--- a/test/CodeGen/X86/xaluo.ll
+++ b/test/CodeGen/X86/xaluo.ll
@@ -1,6 +1,6 @@
; RUN: llc -mtriple=x86_64-darwin-unknown < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SDAG
; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort=1 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FAST
-
+; RUN: llc -mtriple=x86_64-darwin-unknown -mcpu=knl < %s | FileCheck %s --check-prefix=KNL
;
; Get the actual value of the overflow bit.
;
@@ -295,7 +295,7 @@ entry:
define zeroext i1 @smulo.i8(i8 %v1, i8 %v2, i8* %res) {
entry:
; CHECK-LABEL: smulo.i8
-; CHECK: movb %dil, %al
+; CHECK: movl %edi, %eax
; CHECK-NEXT: imulb %sil
; CHECK-NEXT: seto %cl
%t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)
@@ -345,7 +345,7 @@ entry:
define zeroext i1 @umulo.i8(i8 %v1, i8 %v2, i8* %res) {
entry:
; CHECK-LABEL: umulo.i8
-; CHECK: movb %dil, %al
+; CHECK: movl %edi, %eax
; CHECK-NEXT: mulb %sil
; CHECK-NEXT: seto %cl
%t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2)
@@ -734,6 +734,26 @@ continue:
ret i1 true
}
+define i1 @bug27873(i64 %c1, i1 %c2) {
+; KNL-LABEL: bug27873:
+; KNL: ## BB#0:
+; KNL-NEXT: andl $1, %esi
+; KNL-NEXT: kmovw %esi, %k0
+; KNL-NEXT: movl $160, %ecx
+; KNL-NEXT: movq %rdi, %rax
+; KNL-NEXT: mulq %rcx
+; KNL-NEXT: seto %al
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: # kill
+; KNL-NEXT: retq
+ %mul = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c1, i64 160)
+ %mul.overflow = extractvalue { i64, i1 } %mul, 1
+ %x1 = or i1 %c2, %mul.overflow
+ ret i1 %x1
+}
+
declare {i8, i1} @llvm.sadd.with.overflow.i8 (i8, i8 ) nounwind readnone
declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16) nounwind readnone
declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
diff --git a/test/CodeGen/X86/xmulo.ll b/test/CodeGen/X86/xmulo.ll
index 825efa6361b5..76a7e72ca961 100644
--- a/test/CodeGen/X86/xmulo.ll
+++ b/test/CodeGen/X86/xmulo.ll
@@ -9,9 +9,9 @@ declare i32 @printf(i8*, ...)
define i32 @t1() nounwind {
; CHECK-LABEL: t1:
-; CHECK: movl $0, 12(%esp)
-; CHECK: movl $0, 8(%esp)
-; CHECK: movl $72, 4(%esp)
+; CHECK: pushl $0
+; CHECK: pushl $0
+; CHECK: pushl $72
%1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 8)
%2 = extractvalue {i64, i1} %1, 0
@@ -23,9 +23,9 @@ define i32 @t1() nounwind {
define i32 @t2() nounwind {
; CHECK-LABEL: t2:
-; CHECK: movl $0, 12(%esp)
-; CHECK: movl $0, 8(%esp)
-; CHECK: movl $0, 4(%esp)
+; CHECK: pushl $0
+; CHECK: pushl $0
+; CHECK: pushl $0
%1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 0)
%2 = extractvalue {i64, i1} %1, 0
@@ -37,9 +37,9 @@ define i32 @t2() nounwind {
define i32 @t3() nounwind {
; CHECK-LABEL: t3:
-; CHECK: movl $1, 12(%esp)
-; CHECK: movl $-1, 8(%esp)
-; CHECK: movl $-9, 4(%esp)
+; CHECK: pushl $1
+; CHECK: pushl $-1
+; CHECK: pushl $-9
%1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 -1)
%2 = extractvalue {i64, i1} %1, 0
diff --git a/test/CodeGen/X86/xop-intrinsics-fast-isel.ll b/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..a9287e7d8c91
--- /dev/null
+++ b/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
@@ -0,0 +1,1111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i686-unknown-unknown -mattr=+avx,+fma4,+xop | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+xop | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/xop-builtins.c
+
+define <2 x i64> @test_mm_maccs_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maccs_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmacssww %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maccs_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmacssww %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %arg2 = bitcast <2 x i64> %a2 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16> %arg0, <8 x i16> %arg1, <8 x i16> %arg2)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_macc_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_macc_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_macc_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %arg2 = bitcast <2 x i64> %a2 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16> %arg0, <8 x i16> %arg1, <8 x i16> %arg2)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_maccsd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maccsd_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maccsd_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16> %arg0, <8 x i16> %arg1, <4 x i32> %arg2)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_maccd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maccd_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmacswd %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maccd_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmacswd %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16> %arg0, <8 x i16> %arg1, <4 x i32> %arg2)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_maccs_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maccs_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maccs_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_macc_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_macc_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_macc_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_maccslo_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maccslo_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maccslo_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32> %arg0, <4 x i32> %arg1, <2 x i64> %a2)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_macclo_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_macclo_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_macclo_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32> %arg0, <4 x i32> %arg1, <2 x i64> %a2)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_maccshi_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maccshi_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maccshi_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32> %arg0, <4 x i32> %arg1, <2 x i64> %a2)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_macchi_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_macchi_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_macchi_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32> %arg0, <4 x i32> %arg1, <2 x i64> %a2)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_maddsd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maddsd_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maddsd_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16> %arg0, <8 x i16> %arg1, <4 x i32> %arg2)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_maddd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
+; X32-LABEL: test_mm_maddd_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maddd_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %arg0, <8 x i16> %arg1, <4 x i32> %arg2)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
+
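+; The tests below cover the XOP horizontal-add intrinsics: each takes a single
+; source vector and widens the element type (vphaddbw/bd/bq, vphaddwd/wq,
+; vphadddq), with the vphaddu* forms covering the unsigned variants.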
+define <2 x i64> @test_mm_haddw_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddw_epi8:
+; X32: # BB#0:
+; X32-NEXT: vphaddbw %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddw_epi8:
+; X64: # BB#0:
+; X64-NEXT: vphaddbw %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = call <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8> %arg0)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_haddd_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddd_epi8:
+; X32: # BB#0:
+; X32-NEXT: vphaddbd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddd_epi8:
+; X64: # BB#0:
+; X64-NEXT: vphaddbd %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = call <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8> %arg0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_haddq_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddq_epi8:
+; X32: # BB#0:
+; X32-NEXT: vphaddbq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddq_epi8:
+; X64: # BB#0:
+; X64-NEXT: vphaddbq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = call <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8> %arg0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_haddd_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddd_epi16:
+; X32: # BB#0:
+; X32-NEXT: vphaddwd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddd_epi16:
+; X64: # BB#0:
+; X64-NEXT: vphaddwd %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16> %arg0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_haddq_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddq_epi16:
+; X32: # BB#0:
+; X32-NEXT: vphaddwq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddq_epi16:
+; X64: # BB#0:
+; X64-NEXT: vphaddwq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16> %arg0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_haddq_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddq_epi32:
+; X32: # BB#0:
+; X32-NEXT: vphadddq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddq_epi32:
+; X64: # BB#0:
+; X64-NEXT: vphadddq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32> %arg0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_haddw_epu8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddw_epu8:
+; X32: # BB#0:
+; X32-NEXT: vphaddubw %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddw_epu8:
+; X64: # BB#0:
+; X64-NEXT: vphaddubw %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = call <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8> %arg0)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_haddd_epu8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddd_epu8:
+; X32: # BB#0:
+; X32-NEXT: vphaddubd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddd_epu8:
+; X64: # BB#0:
+; X64-NEXT: vphaddubd %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = call <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8> %arg0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_haddq_epu8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddq_epu8:
+; X32: # BB#0:
+; X32-NEXT: vphaddubq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddq_epu8:
+; X64: # BB#0:
+; X64-NEXT: vphaddubq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = call <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8> %arg0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_haddd_epu16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddd_epu16:
+; X32: # BB#0:
+; X32-NEXT: vphadduwd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddd_epu16:
+; X64: # BB#0:
+; X64-NEXT: vphadduwd %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16> %arg0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_haddq_epu16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddq_epu16:
+; X32: # BB#0:
+; X32-NEXT: vphadduwq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddq_epu16:
+; X64: # BB#0:
+; X64-NEXT: vphadduwq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16> %arg0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_haddq_epu32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_haddq_epu32:
+; X32: # BB#0:
+; X32-NEXT: vphaddudq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_haddq_epu32:
+; X64: # BB#0:
+; X64-NEXT: vphaddudq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32> %arg0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_hsubw_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_hsubw_epi8:
+; X32: # BB#0:
+; X32-NEXT: vphsubbw %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_hsubw_epi8:
+; X64: # BB#0:
+; X64-NEXT: vphsubbw %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = call <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8> %arg0)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_hsubd_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_hsubd_epi16:
+; X32: # BB#0:
+; X32-NEXT: vphsubwd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_hsubd_epi16:
+; X64: # BB#0:
+; X64-NEXT: vphsubwd %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16> %arg0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_hsubq_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_hsubq_epi32:
+; X32: # BB#0:
+; X32-NEXT: vphsubdq %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_hsubq_epi32:
+; X64: # BB#0:
+; X64-NEXT: vphsubdq %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = call <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32> %arg0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32>) nounwind readnone
+
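+; Note: the CHECK lines expect the 128-bit vpcmov intrinsic to be expanded into
+; an equivalent vpcmpeqd/vpxor/vpand/vpor sequence, while the 256-bit variant
+; below still selects vpcmov directly.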
+define <2 x i64> @test_mm_cmov_si128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm_cmov_si128:
+; X32: # BB#0:
+; X32-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; X32-NEXT: vpxor %xmm3, %xmm2, %xmm3
+; X32-NEXT: vpand %xmm2, %xmm0, %xmm0
+; X32-NEXT: vpand %xmm3, %xmm1, %xmm1
+; X32-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cmov_si128:
+; X64: # BB#0:
+; X64-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; X64-NEXT: vpxor %xmm3, %xmm2, %xmm3
+; X64-NEXT: vpand %xmm2, %xmm0, %xmm0
+; X64-NEXT: vpand %xmm3, %xmm1, %xmm1
+; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_mm256_cmov_si256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
+; X32-LABEL: test_mm256_cmov_si256:
+; X32: # BB#0:
+; X32-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cmov_si256:
+; X64: # BB#0:
+; X64-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64>, <4 x i64>, <4 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_perm_epi8(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm_perm_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_perm_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %arg0, <16 x i8> %arg1, <16 x i8> %arg2)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_rot_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_rot_epi8:
+; X32: # BB#0:
+; X32-NEXT: vprotb %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_rot_epi8:
+; X64: # BB#0:
+; X64-NEXT: vprotb %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.xop.vprotb(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.xop.vprotb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_rot_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_rot_epi16:
+; X32: # BB#0:
+; X32-NEXT: vprotw %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_rot_epi16:
+; X64: # BB#0:
+; X64-NEXT: vprotw %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.xop.vprotw(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vprotw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_rot_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_rot_epi32:
+; X32: # BB#0:
+; X32-NEXT: vprotd %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_rot_epi32:
+; X64: # BB#0:
+; X64-NEXT: vprotd %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vprotd(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vprotd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_rot_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_rot_epi64:
+; X32: # BB#0:
+; X32-NEXT: vprotq %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_rot_epi64:
+; X64: # BB#0:
+; X64-NEXT: vprotq %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vprotq(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vprotq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_roti_epi8(<2 x i64> %a0) {
+; X32-LABEL: test_mm_roti_epi8:
+; X32: # BB#0:
+; X32-NEXT: vprotb $1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_roti_epi8:
+; X64: # BB#0:
+; X64-NEXT: vprotb $1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8> %arg0, i8 1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_roti_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm_roti_epi16:
+; X32: # BB#0:
+; X32-NEXT: vprotw $50, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_roti_epi16:
+; X64: # BB#0:
+; X64-NEXT: vprotw $50, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16> %arg0, i8 50)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16>, i8) nounwind readnone
+
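+; Note: the rotate amount is an i8 immediate, so the -30 used in
+; test_mm_roti_epi32 is printed as its unsigned 8-bit value (226).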
+define <2 x i64> @test_mm_roti_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm_roti_epi32:
+; X32: # BB#0:
+; X32-NEXT: vprotd $226, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_roti_epi32:
+; X64: # BB#0:
+; X64-NEXT: vprotd $226, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32> %arg0, i8 -30)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_roti_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm_roti_epi64:
+; X32: # BB#0:
+; X32-NEXT: vprotq $100, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_roti_epi64:
+; X64: # BB#0:
+; X64-NEXT: vprotq $100, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64> %a0, i8 100)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_shl_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_shl_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shl_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_shl_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_shl_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpshlw %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shl_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpshlw %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_shl_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_shl_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpshld %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shl_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpshld %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpshld(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpshld(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_shl_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_shl_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpshlq %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_shl_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpshlq %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x i64> @test_mm_sha_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sha_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpshab %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sha_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpshab %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %arg0, <16 x i8> %arg1)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.xop.vpshab(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <2 x i64> @test_mm_sha_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sha_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpshaw %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sha_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpshaw %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16> %arg0, <8 x i16> %arg1)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_mm_sha_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sha_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpshad %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sha_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpshad %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpshad(<4 x i32> %arg0, <4 x i32> %arg1)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpshad(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm_sha_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_sha_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpshaq %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sha_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpshaq %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64>, <2 x i64>) nounwind readnone
+
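+; The vpcom* intrinsics below take an i8 immediate selecting the comparison
+; predicate; immediate 0 selects "less than", so the CHECK lines expect the
+; vpcomlt*/vpcomltu* mnemonic forms.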
+define <2 x i64> @test_mm_com_epu8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_com_epu8:
+; X32: # BB#0:
+; X32-NEXT: vpcomltub %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_com_epu8:
+; X64: # BB#0:
+; X64-NEXT: vpcomltub %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %arg0, <16 x i8> %arg1, i8 0)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_com_epu16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_com_epu16:
+; X32: # BB#0:
+; X32-NEXT: vpcomltuw %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_com_epu16:
+; X64: # BB#0:
+; X64-NEXT: vpcomltuw %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %arg0, <8 x i16> %arg1, i8 0)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_com_epu32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_com_epu32:
+; X32: # BB#0:
+; X32-NEXT: vpcomltud %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_com_epu32:
+; X64: # BB#0:
+; X64-NEXT: vpcomltud %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %arg0, <4 x i32> %arg1, i8 0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_com_epu64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_com_epu64:
+; X32: # BB#0:
+; X32-NEXT: vpcomltuq %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_com_epu64:
+; X64: # BB#0:
+; X64-NEXT: vpcomltuq %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_com_epi8(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_com_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpcomltb %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_com_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpcomltb %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+ %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
+ %res = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %arg0, <16 x i8> %arg1, i8 0)
+ %bc = bitcast <16 x i8> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_com_epi16(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_com_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpcomltw %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_com_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpcomltw %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+ %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
+ %res = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %arg0, <8 x i16> %arg1, i8 0)
+ %bc = bitcast <8 x i16> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_com_epi32(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_com_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpcomltd %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_com_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpcomltd %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+ %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+ %res = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %arg0, <4 x i32> %arg1, i8 0)
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <2 x i64> @test_mm_com_epi64(<2 x i64> %a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_com_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpcomltq %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_com_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpcomltq %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <2 x double> @test_mm_permute2_pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm_permute2_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermil2pd $0, %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_permute2_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermil2pd $0, %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2, i8 0)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone
+
+define <4 x double> @test_mm256_permute2_pd(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2) {
+; X32-LABEL: test_mm256_permute2_pd:
+; X32: # BB#0:
+; X32-NEXT: vpermil2pd $0, %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute2_pd:
+; X64: # BB#0:
+; X64-NEXT: vpermil2pd $0, %ymm2, %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2, i8 0)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone
+
+define <4 x float> @test_mm_permute2_ps(<4 x float> %a0, <4 x float> %a1, <2 x i64> %a2) {
+; X32-LABEL: test_mm_permute2_ps:
+; X32: # BB#0:
+; X32-NEXT: vpermil2ps $0, %xmm2, %xmm1, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_permute2_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermil2ps $0, %xmm2, %xmm1, %xmm0, %xmm0
+; X64-NEXT: retq
+ %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
+ %res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %arg2, i8 0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone
+
+define <8 x float> @test_mm256_permute2_ps(<8 x float> %a0, <8 x float> %a1, <4 x i64> %a2) {
+; X32-LABEL: test_mm256_permute2_ps:
+; X32: # BB#0:
+; X32-NEXT: vpermil2ps $0, %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_permute2_ps:
+; X64: # BB#0:
+; X64-NEXT: vpermil2ps $0, %ymm2, %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
+ %res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %arg2, i8 0)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone
+
+define <4 x float> @test_mm_frcz_ss(<4 x float> %a0) {
+; X32-LABEL: test_mm_frcz_ss:
+; X32: # BB#0:
+; X32-NEXT: vfrczss %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_frcz_ss:
+; X64: # BB#0:
+; X64-NEXT: vfrczss %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %a0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float>) nounwind readnone
+
+define <2 x double> @test_mm_frcz_sd(<2 x double> %a0) {
+; X32-LABEL: test_mm_frcz_sd:
+; X32: # BB#0:
+; X32-NEXT: vfrczsd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_frcz_sd:
+; X64: # BB#0:
+; X64-NEXT: vfrczsd %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %a0)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double>) nounwind readnone
+
+define <4 x float> @test_mm_frcz_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_frcz_ps:
+; X32: # BB#0:
+; X32-NEXT: vfrczps %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_frcz_ps:
+; X64: # BB#0:
+; X64-NEXT: vfrczps %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float> %a0)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float>) nounwind readnone
+
+define <2 x double> @test_mm_frcz_pd(<2 x double> %a0) {
+; X32-LABEL: test_mm_frcz_pd:
+; X32: # BB#0:
+; X32-NEXT: vfrczpd %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_frcz_pd:
+; X64: # BB#0:
+; X64-NEXT: vfrczpd %xmm0, %xmm0
+; X64-NEXT: retq
+ %res = call <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double> %a0)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double>) nounwind readnone
+
+define <8 x float> @test_mm256_frcz_ps(<8 x float> %a0) {
+; X32-LABEL: test_mm256_frcz_ps:
+; X32: # BB#0:
+; X32-NEXT: vfrczps %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_frcz_ps:
+; X64: # BB#0:
+; X64-NEXT: vfrczps %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float> %a0)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float>) nounwind readnone
+
+define <4 x double> @test_mm256_frcz_pd(<4 x double> %a0) {
+; X32-LABEL: test_mm256_frcz_pd:
+; X32: # BB#0:
+; X32-NEXT: vfrczpd %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_frcz_pd:
+; X64: # BB#0:
+; X64-NEXT: vfrczpd %ymm0, %ymm0
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double> %a0)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double>) nounwind readnone
diff --git a/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll b/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll
new file mode 100644
index 000000000000..6fba72f2681b
--- /dev/null
+++ b/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll
@@ -0,0 +1,727 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+xop | FileCheck %s
+
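+; The *_mr/*_rm variants below check that either vector source of vpermil2pd
+; can be folded from memory ((%rdi)).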
+define <2 x double> @test_int_x86_xop_vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $1, %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 1) ; [#uses=1]
+ ret <2 x double> %res
+}
+define <2 x double> @test_int_x86_xop_vpermil2pd_mr(<2 x double> %a0, <2 x double>* %a1, <2 x double> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_mr:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $1, %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %a1
+ %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %vec, <2 x double> %a2, i8 1) ; [#uses=1]
+ ret <2 x double> %res
+}
+define <2 x double> @test_int_x86_xop_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x double>* %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_rm:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $1, (%rdi), %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %a2
+ %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %vec, i8 1) ; [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x double> @test_int_x86_xop_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $2, %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 2) ;
+ ret <4 x double> %res
+}
+define <4 x double> @test_int_x86_xop_vpermil2pd_256_mr(<4 x double> %a0, <4 x double>* %a1, <4 x double> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_mr:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $2, %ymm1, (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %a1
+ %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %vec, <4 x double> %a2, i8 2) ;
+ ret <4 x double> %res
+}
+define <4 x double> @test_int_x86_xop_vpermil2pd_256_rm(<4 x double> %a0, <4 x double> %a1, <4 x double>* %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_rm:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $2, (%rdi), %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %a2
+ %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %vec, i8 2) ;
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x float> @test_int_x86_xop_vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2ps:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 3) ;
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x float> @test_int_x86_xop_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2ps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2ps $4, %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 4) ;
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomeqb(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomeqb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomeqb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomeqb(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_int_x86_xop_vpcomeqb_mem(<16 x i8> %a0, <16 x i8>* %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomeqb_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomeqb (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %a1
+ %res = call <16 x i8> @llvm.x86.xop.vpcomeqb(<16 x i8> %a0, <16 x i8> %vec) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomeqb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomeqw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomeqw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomeqw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomeqw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomeqw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomeqd(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomeqd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomeqd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomeqd(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomeqd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomeqq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomeqq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomeqq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomeqq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomeqq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomequb(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomequb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomequb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomequb(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomequb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomequd(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomequd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomequd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomequd(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomequd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomequq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomequq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomequq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomequq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomequq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomequw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomequw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomequw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomequw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomequw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomfalseb(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomfalseb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomfalseb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomfalseb(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomfalseb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomfalsed(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomfalsed:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomfalsed %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomfalsed(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomfalsed(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomfalseq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomfalseq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomfalseq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomfalseq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomfalseq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomfalseub(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomfalseub:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomfalseub %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomfalseub(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomfalseub(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomfalseud(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomfalseud:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomfalseud %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomfalseud(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomfalseud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomfalseuq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomfalseuq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomfalseuq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomfalseuq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomfalseuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomfalseuw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomfalseuw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomfalseuw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomfalseuw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomfalseuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomfalsew(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomfalsew:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomfalsew %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomfalsew(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomfalsew(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomgeb(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgeb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgeb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomgeb(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomgeb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomged(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomged:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomged %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomged(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomged(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomgeq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgeq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgeq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomgeq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomgeq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomgeub(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgeub:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgeub %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomgeub(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomgeub(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomgeud(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgeud:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgeud %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomgeud(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomgeud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomgeuq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgeuq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgeuq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomgeuq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomgeuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomgeuw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgeuw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgeuw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomgeuw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomgeuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomgew(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgew:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgew %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomgew(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomgew(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomgtb(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgtb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgtb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomgtb(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomgtb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomgtd(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgtd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgtd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomgtd(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomgtd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomgtq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgtq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgtq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomgtq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomgtq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomgtub(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgtub:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgtub %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomgtub(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomgtub(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomgtud(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgtud:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgtud %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomgtud(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomgtud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomgtuq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgtuq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomgtuq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomgtuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomgtuw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgtuw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgtuw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomgtuw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomgtuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomgtw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomgtw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomgtw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomgtw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomgtw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomleb(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomleb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomleb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomleb(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomleb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomled(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomled:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomled %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomled(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomled(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomleq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomleq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomleq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomleq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomleq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomleub(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomleub:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomleub %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomleub(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomleub(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomleud(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomleud:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomleud %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomleud(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomleud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomleuq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomleuq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomleuq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomleuq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomleuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomleuw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomleuw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomleuw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomleuw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomleuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomlew(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomlew:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomlew %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomlew(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomlew(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomltb(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomltb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomltb(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomltb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomltd(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomltd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomltd(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomltd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomltq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomltq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomltq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomltq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomltub(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomltub:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltub %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomltub(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomltub(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomltud(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomltud:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltud %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomltud(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomltud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomltuq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomltuq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltuq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomltuq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomltuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomltuw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomltuw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltuw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomltuw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomltuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomltw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomltw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomltw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomltw(<8 x i16>, <8 x i16>) nounwind readnone
+
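+; Note: the vpcomne* intrinsics are printed with the vpcomneq*/vpcomnequ*
+; mnemonic spellings in the CHECK lines below.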
+define <16 x i8> @test_int_x86_xop_vpcomneb(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomneb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomneqb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomneb(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomneb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomned(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomned:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomneqd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomned(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomned(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomneq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomneq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomneqq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomneq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomneq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomneub(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomneub:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomnequb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomneub(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomneub(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomneud(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomneud:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomnequd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomneud(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomneud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomneuq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomneuq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomnequq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomneuq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomneuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomneuw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomneuw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomnequw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomneuw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomneuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomnew(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomnew:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomneqw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomnew(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomnew(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomtrueb(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomtrueb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomtrueb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomtrueb(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomtrueb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomtrued(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomtrued:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomtrued %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomtrued(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomtrued(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomtrueq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomtrueq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomtrueq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomtrueq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomtrueq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomtrueub(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomtrueub:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomtrueub %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.xop.vpcomtrueub(<16 x i8> %a0, <16 x i8> %a1) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomtrueub(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomtrueud(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomtrueud:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomtrueud %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.xop.vpcomtrueud(<4 x i32> %a0, <4 x i32> %a1) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomtrueud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomtrueuq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomtrueuq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomtrueuq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomtrueuq(<2 x i64> %a0, <2 x i64> %a1) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomtrueuq(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomtrueuw(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomtrueuw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomtrueuw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomtrueuw(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomtrueuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomtruew(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_int_x86_xop_vpcomtruew:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomtruew %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.xop.vpcomtruew(<8 x i16> %a0, <8 x i16> %a1) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomtruew(<8 x i16>, <8 x i16>) nounwind readnone
diff --git a/test/CodeGen/X86/xop-intrinsics-x86_64.ll b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
index 3b4c6ea12107..bb6ef50cdc6c 100644
--- a/test/CodeGen/X86/xop-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
@@ -1,649 +1,263 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+avx,+fma4,+xop | FileCheck %s
-
-define <2 x double> @test_int_x86_xop_vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK: vpermil2pd
- %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 1) ; [#uses=1]
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+xop | FileCheck %s
+
+define <2 x double> @test_int_x86_xop_vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $1, %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2, i8 1) ; [#uses=1]
ret <2 x double> %res
}
-define <2 x double> @test_int_x86_xop_vpermil2pd_mr(<2 x double> %a0, <2 x double>* %a1, <2 x double> %a2) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpermil2pd
+define <2 x double> @test_int_x86_xop_vpermil2pd_mr(<2 x double> %a0, <2 x double>* %a1, <2 x i64> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_mr:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $1, %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
%vec = load <2 x double>, <2 x double>* %a1
- %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %vec, <2 x double> %a2, i8 1) ; [#uses=1]
+ %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %vec, <2 x i64> %a2, i8 1) ; [#uses=1]
ret <2 x double> %res
}
-define <2 x double> @test_int_x86_xop_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x double>* %a2) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpermil2pd
- %vec = load <2 x double>, <2 x double>* %a2
- %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %vec, i8 1) ; [#uses=1]
+define <2 x double> @test_int_x86_xop_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x i64>* %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_rm:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $1, (%rdi), %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %a2
+ %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %vec, i8 1) ; [#uses=1]
ret <2 x double> %res
}
-declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone
-define <4 x double> @test_int_x86_xop_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
- ; CHECK: vpermil2pd
- ; CHECK: ymm
- %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 2) ;
+define <4 x double> @test_int_x86_xop_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $2, %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2, i8 2) ;
ret <4 x double> %res
}
-define <4 x double> @test_int_x86_xop_vpermil2pd_256_mr(<4 x double> %a0, <4 x double>* %a1, <4 x double> %a2) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpermil2pd
- ; CHECK: ymm
+define <4 x double> @test_int_x86_xop_vpermil2pd_256_mr(<4 x double> %a0, <4 x double>* %a1, <4 x i64> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_mr:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $2, %ymm1, (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
%vec = load <4 x double>, <4 x double>* %a1
- %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %vec, <4 x double> %a2, i8 2) ;
+ %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %vec, <4 x i64> %a2, i8 2) ;
ret <4 x double> %res
}
-define <4 x double> @test_int_x86_xop_vpermil2pd_256_rm(<4 x double> %a0, <4 x double> %a1, <4 x double>* %a2) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpermil2pd
- ; CHECK: ymm
- %vec = load <4 x double>, <4 x double>* %a2
- %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %vec, i8 2) ;
+define <4 x double> @test_int_x86_xop_vpermil2pd_256_rm(<4 x double> %a0, <4 x double> %a1, <4 x i64>* %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_rm:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2pd $2, (%rdi), %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %a2
+ %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %vec, i8 2) ;
ret <4 x double> %res
}
-declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone
-define <4 x float> @test_int_x86_xop_vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: vpermil2ps
- %res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 3) ;
+define <4 x float> @test_int_x86_xop_vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2ps:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2, i8 3) ;
ret <4 x float> %res
}
-declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone
-define <8 x float> @test_int_x86_xop_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
- ; CHECK: vpermil2ps
- ; CHECK: ymm
- %res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 4) ;
+define <8 x float> @test_int_x86_xop_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpermil2ps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpermil2ps $4, %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2, i8 4) ;
ret <8 x float> %res
}
-declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
- ; CHECK: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-LABEL: test_int_x86_xop_vpcmov:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone
define <4 x i64> @test_int_x86_xop_vpcmov_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
- ; CHECK: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-LABEL: test_int_x86_xop_vpcmov_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) ;
ret <4 x i64> %res
}
define <4 x i64> @test_int_x86_xop_vpcmov_256_mr(<4 x i64> %a0, <4 x i64>* %a1, <4 x i64> %a2) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpcmov
- ; CHECK: ymm
+; CHECK-LABEL: test_int_x86_xop_vpcmov_256_mr:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov %ymm1, (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %a1
%res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %vec, <4 x i64> %a2) ;
ret <4 x i64> %res
}
define <4 x i64> @test_int_x86_xop_vpcmov_256_rm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64>* %a2) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpcmov
- ; CHECK: ymm
+; CHECK-LABEL: test_int_x86_xop_vpcmov_256_rm:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcmov (%rdi), %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %a2
%res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %vec) ;
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64>, <4 x i64>, <4 x i64>) nounwind readnone
-define <16 x i8> @test_int_x86_xop_vpcomeqb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK:vpcomeqb
- %res = call <16 x i8> @llvm.x86.xop.vpcomeqb(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-define <16 x i8> @test_int_x86_xop_vpcomeqb_mem(<16 x i8> %a0, <16 x i8>* %a1) {
- ; CHECK-NOT: vmovaps
- ; CHECK:vpcomeqb
- %vec = load <16 x i8>, <16 x i8>* %a1
- %res = call <16 x i8> @llvm.x86.xop.vpcomeqb(<16 x i8> %a0, <16 x i8> %vec) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomeqb(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomeqw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomeqw
- %res = call <8 x i16> @llvm.x86.xop.vpcomeqw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomeqw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomeqd(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomeqd
- %res = call <4 x i32> @llvm.x86.xop.vpcomeqd(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomeqd(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomeqq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomeqq
- %res = call <2 x i64> @llvm.x86.xop.vpcomeqq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomeqq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomequb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomequb
- %res = call <16 x i8> @llvm.x86.xop.vpcomequb(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomequb(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomequd(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomequd
- %res = call <4 x i32> @llvm.x86.xop.vpcomequd(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomequd(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomequq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomequq
- %res = call <2 x i64> @llvm.x86.xop.vpcomequq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomequq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomequw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomequw
- %res = call <8 x i16> @llvm.x86.xop.vpcomequw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomequw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomfalseb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomfalseb
- %res = call <16 x i8> @llvm.x86.xop.vpcomfalseb(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomfalseb(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomfalsed(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomfalsed
- %res = call <4 x i32> @llvm.x86.xop.vpcomfalsed(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomfalsed(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomfalseq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomfalseq
- %res = call <2 x i64> @llvm.x86.xop.vpcomfalseq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomfalseq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomfalseub(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomfalseub
- %res = call <16 x i8> @llvm.x86.xop.vpcomfalseub(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomfalseub(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomfalseud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomfalseud
- %res = call <4 x i32> @llvm.x86.xop.vpcomfalseud(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomfalseud(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomfalseuq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomfalseuq
- %res = call <2 x i64> @llvm.x86.xop.vpcomfalseuq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomfalseuq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomfalseuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomfalseuw
- %res = call <8 x i16> @llvm.x86.xop.vpcomfalseuw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomfalseuw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomfalsew(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomfalsew
- %res = call <8 x i16> @llvm.x86.xop.vpcomfalsew(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomfalsew(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomgeb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomgeb
- %res = call <16 x i8> @llvm.x86.xop.vpcomgeb(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomgeb(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomged(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomged
- %res = call <4 x i32> @llvm.x86.xop.vpcomged(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomged(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomgeq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomgeq
- %res = call <2 x i64> @llvm.x86.xop.vpcomgeq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomgeq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomgeub(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomgeub
- %res = call <16 x i8> @llvm.x86.xop.vpcomgeub(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomgeub(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomgeud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomgeud
- %res = call <4 x i32> @llvm.x86.xop.vpcomgeud(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomgeud(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomgeuq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomgeuq
- %res = call <2 x i64> @llvm.x86.xop.vpcomgeuq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomgeuq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomgeuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomgeuw
- %res = call <8 x i16> @llvm.x86.xop.vpcomgeuw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomgeuw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomgew(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomgew
- %res = call <8 x i16> @llvm.x86.xop.vpcomgew(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomgew(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomgtb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomgtb
- %res = call <16 x i8> @llvm.x86.xop.vpcomgtb(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomgtb(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomgtd(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomgtd
- %res = call <4 x i32> @llvm.x86.xop.vpcomgtd(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomgtd(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomgtq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomgtq
- %res = call <2 x i64> @llvm.x86.xop.vpcomgtq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomgtq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomgtub(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomgtub
- %res = call <16 x i8> @llvm.x86.xop.vpcomgtub(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomgtub(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomgtud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomgtud
- %res = call <4 x i32> @llvm.x86.xop.vpcomgtud(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomgtud(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomgtuq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomgtuq
- %res = call <2 x i64> @llvm.x86.xop.vpcomgtuq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomgtuq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomgtuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomgtuw
- %res = call <8 x i16> @llvm.x86.xop.vpcomgtuw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomgtuw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomgtw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomgtw
- %res = call <8 x i16> @llvm.x86.xop.vpcomgtw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomgtw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomleb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomleb
- %res = call <16 x i8> @llvm.x86.xop.vpcomleb(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomleb(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomled(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomled
- %res = call <4 x i32> @llvm.x86.xop.vpcomled(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomled(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomleq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomleq
- %res = call <2 x i64> @llvm.x86.xop.vpcomleq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomleq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomleub(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomleub
- %res = call <16 x i8> @llvm.x86.xop.vpcomleub(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomleub(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomleud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomleud
- %res = call <4 x i32> @llvm.x86.xop.vpcomleud(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomleud(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomleuq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomleuq
- %res = call <2 x i64> @llvm.x86.xop.vpcomleuq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomleuq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomleuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomleuw
- %res = call <8 x i16> @llvm.x86.xop.vpcomleuw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomleuw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomlew(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomlew
- %res = call <8 x i16> @llvm.x86.xop.vpcomlew(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomlew(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomltb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomltb
- %res = call <16 x i8> @llvm.x86.xop.vpcomltb(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomltb(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomltd(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomltd
- %res = call <4 x i32> @llvm.x86.xop.vpcomltd(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomltd(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomltq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomltq
- %res = call <2 x i64> @llvm.x86.xop.vpcomltq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomltq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomltub(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomltub
- %res = call <16 x i8> @llvm.x86.xop.vpcomltub(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomltub(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomltud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomltud
- %res = call <4 x i32> @llvm.x86.xop.vpcomltud(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomltud(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomltuq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomltuq
- %res = call <2 x i64> @llvm.x86.xop.vpcomltuq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomltuq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomltuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomltuw
- %res = call <8 x i16> @llvm.x86.xop.vpcomltuw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomltuw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomltw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomltw
- %res = call <8 x i16> @llvm.x86.xop.vpcomltw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomltw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomneb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomneqb
- %res = call <16 x i8> @llvm.x86.xop.vpcomneb(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomneb(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomned(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomneqd
- %res = call <4 x i32> @llvm.x86.xop.vpcomned(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomned(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomneq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomneqq
- %res = call <2 x i64> @llvm.x86.xop.vpcomneq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomneq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomneub(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomnequb
- %res = call <16 x i8> @llvm.x86.xop.vpcomneub(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomneub(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomneud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomnequd
- %res = call <4 x i32> @llvm.x86.xop.vpcomneud(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomneud(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomneuq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomnequq
- %res = call <2 x i64> @llvm.x86.xop.vpcomneuq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomneuq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomneuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomnequw
- %res = call <8 x i16> @llvm.x86.xop.vpcomneuw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomneuw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomnew(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomneqw
- %res = call <8 x i16> @llvm.x86.xop.vpcomnew(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomnew(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomtrueb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomtrueb
- %res = call <16 x i8> @llvm.x86.xop.vpcomtrueb(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomtrueb(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomtrued(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomtrued
- %res = call <4 x i32> @llvm.x86.xop.vpcomtrued(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomtrued(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomtrueq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomtrueq
- %res = call <2 x i64> @llvm.x86.xop.vpcomtrueq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomtrueq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <16 x i8> @test_int_x86_xop_vpcomtrueub(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcomtrueub
- %res = call <16 x i8> @llvm.x86.xop.vpcomtrueub(<16 x i8> %a0, <16 x i8> %a1) ;
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.xop.vpcomtrueub(<16 x i8>, <16 x i8>) nounwind readnone
-
-define <4 x i32> @test_int_x86_xop_vpcomtrueud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomtrueud
- %res = call <4 x i32> @llvm.x86.xop.vpcomtrueud(<4 x i32> %a0, <4 x i32> %a1) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.xop.vpcomtrueud(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcomtrueuq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomtrueuq
- %res = call <2 x i64> @llvm.x86.xop.vpcomtrueuq(<2 x i64> %a0, <2 x i64> %a1) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcomtrueuq(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomtrueuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomtrueuw
- %res = call <8 x i16> @llvm.x86.xop.vpcomtrueuw(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomtrueuw(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <8 x i16> @test_int_x86_xop_vpcomtruew(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomtruew
- %res = call <8 x i16> @llvm.x86.xop.vpcomtruew(<8 x i16> %a0, <8 x i16> %a1) ;
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.xop.vpcomtruew(<8 x i16>, <8 x i16>) nounwind readnone
-
define <4 x i32> @test_int_x86_xop_vphaddbd(<16 x i8> %a0) {
- ; CHECK: vphaddbd
+; CHECK-LABEL: test_int_x86_xop_vphaddbd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddbd %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8> %a0) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphaddbq(<16 x i8> %a0) {
- ; CHECK: vphaddbq
+; CHECK-LABEL: test_int_x86_xop_vphaddbq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddbq %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8> %a0) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vphaddbw(<16 x i8> %a0) {
- ; CHECK: vphaddbw
+; CHECK-LABEL: test_int_x86_xop_vphaddbw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddbw %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8> %a0) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphadddq(<4 x i32> %a0) {
- ; CHECK: vphadddq
+; CHECK-LABEL: test_int_x86_xop_vphadddq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphadddq %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32> %a0) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vphaddubd(<16 x i8> %a0) {
- ; CHECK: vphaddubd
+; CHECK-LABEL: test_int_x86_xop_vphaddubd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddubd %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8> %a0) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphaddubq(<16 x i8> %a0) {
- ; CHECK: vphaddubq
+; CHECK-LABEL: test_int_x86_xop_vphaddubq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddubq %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8> %a0) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vphaddubw(<16 x i8> %a0) {
- ; CHECK: vphaddubw
+; CHECK-LABEL: test_int_x86_xop_vphaddubw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddubw %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8> %a0) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphaddudq(<4 x i32> %a0) {
- ; CHECK: vphaddudq
+; CHECK-LABEL: test_int_x86_xop_vphaddudq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddudq %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32> %a0) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vphadduwd(<8 x i16> %a0) {
- ; CHECK: vphadduwd
+; CHECK-LABEL: test_int_x86_xop_vphadduwd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphadduwd %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16> %a0) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphadduwq(<8 x i16> %a0) {
- ; CHECK: vphadduwq
+; CHECK-LABEL: test_int_x86_xop_vphadduwq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphadduwq %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16> %a0) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vphaddwd(<8 x i16> %a0) {
- ; CHECK: vphaddwd
+; CHECK-LABEL: test_int_x86_xop_vphaddwd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddwd %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16> %a0) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphaddwq(<8 x i16> %a0) {
- ; CHECK: vphaddwq
+; CHECK-LABEL: test_int_x86_xop_vphaddwq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphaddwq %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16> %a0) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vphsubbw(<16 x i8> %a0) {
- ; CHECK: vphsubbw
+; CHECK-LABEL: test_int_x86_xop_vphsubbw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphsubbw %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8> %a0) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphsubdq(<4 x i32> %a0) {
- ; CHECK: vphsubdq
+; CHECK-LABEL: test_int_x86_xop_vphsubdq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphsubdq %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32> %a0) ;
ret <2 x i64> %res
}
define <2 x i64> @test_int_x86_xop_vphsubdq_mem(<4 x i32>* %a0) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vphsubdq
+; CHECK-LABEL: test_int_x86_xop_vphsubdq_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphsubdq (%rdi), %xmm0
+; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %a0
%res = call <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32> %vec) ;
ret <2 x i64> %res
@@ -651,13 +265,18 @@ define <2 x i64> @test_int_x86_xop_vphsubdq_mem(<4 x i32>* %a0) {
declare <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vphsubwd(<8 x i16> %a0) {
- ; CHECK: vphsubwd
+; CHECK-LABEL: test_int_x86_xop_vphsubwd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphsubwd %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16> %a0) ;
ret <4 x i32> %res
}
define <4 x i32> @test_int_x86_xop_vphsubwd_mem(<8 x i16>* %a0) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vphsubwd
+; CHECK-LABEL: test_int_x86_xop_vphsubwd_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vphsubwd (%rdi), %xmm0
+; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %a0
%res = call <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16> %vec) ;
ret <4 x i32> %res
@@ -665,90 +284,128 @@ define <4 x i32> @test_int_x86_xop_vphsubwd_mem(<8 x i16>* %a0) {
declare <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpmacsdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
- ; CHECK: vpmacsdd
+; CHECK-LABEL: test_int_x86_xop_vpmacsdd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpmacsdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
- ; CHECK: vpmacsdqh
+; CHECK-LABEL: test_int_x86_xop_vpmacsdqh:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpmacsdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
- ; CHECK: vpmacsdql
+; CHECK-LABEL: test_int_x86_xop_vpmacsdql:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpmacssdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
- ; CHECK: vpmacssdd
+; CHECK-LABEL: test_int_x86_xop_vpmacssdd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpmacssdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
- ; CHECK: vpmacssdqh
+; CHECK-LABEL: test_int_x86_xop_vpmacssdqh:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpmacssdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
- ; CHECK: vpmacssdql
+; CHECK-LABEL: test_int_x86_xop_vpmacssdql:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpmacsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
- ; CHECK: vpmacsswd
+; CHECK-LABEL: test_int_x86_xop_vpmacsswd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vpmacssww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) {
- ; CHECK: vpmacssww
+; CHECK-LABEL: test_int_x86_xop_vpmacssww:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacssww %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpmacswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
- ; CHECK: vpmacswd
+; CHECK-LABEL: test_int_x86_xop_vpmacswd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacswd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vpmacsww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) {
- ; CHECK: vpmacsww
+; CHECK-LABEL: test_int_x86_xop_vpmacsww:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpmadcsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
- ; CHECK: vpmadcsswd
+; CHECK-LABEL: test_int_x86_xop_vpmadcsswd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpmadcswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
- ; CHECK: vpmadcswd
+; CHECK-LABEL: test_int_x86_xop_vpmadcswd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) ;
ret <4 x i32> %res
}
define <4 x i32> @test_int_x86_xop_vpmadcswd_mem(<8 x i16> %a0, <8 x i16>* %a1, <4 x i32> %a2) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpmadcswd
+; CHECK-LABEL: test_int_x86_xop_vpmadcswd_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpmadcswd %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %a1
%res = call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %a0, <8 x i16> %vec, <4 x i32> %a2) ;
ret <4 x i32> %res
@@ -756,20 +413,27 @@ define <4 x i32> @test_int_x86_xop_vpmadcswd_mem(<8 x i16> %a0, <8 x i16>* %a1,
declare <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
- ; CHECK: vpperm
+; CHECK-LABEL: test_int_x86_xop_vpperm:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) ;
ret <16 x i8> %res
}
define <16 x i8> @test_int_x86_xop_vpperm_rm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %a2) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpperm
+; CHECK-LABEL: test_int_x86_xop_vpperm_rm:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpperm (%rdi), %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %a2
%res = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %vec) ;
ret <16 x i8> %res
}
define <16 x i8> @test_int_x86_xop_vpperm_mr(<16 x i8> %a0, <16 x i8>* %a1, <16 x i8> %a2) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpperm
+; CHECK-LABEL: test_int_x86_xop_vpperm_mr:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpperm %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %a1
%res = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %vec, <16 x i8> %a2) ;
ret <16 x i8> %res
@@ -777,125 +441,177 @@ define <16 x i8> @test_int_x86_xop_vpperm_mr(<16 x i8> %a0, <16 x i8>* %a1, <16
declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vprotb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vprotb
+; CHECK-LABEL: test_int_x86_xop_vprotb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vprotb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vprotb(<16 x i8> %a0, <16 x i8> %a1) ;
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.xop.vprotb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vprotd(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vprotd
+; CHECK-LABEL: test_int_x86_xop_vprotd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vprotd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vprotd(<4 x i32> %a0, <4 x i32> %a1) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vprotd(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vprotq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vprotq
+; CHECK-LABEL: test_int_x86_xop_vprotq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vprotq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vprotq(<2 x i64> %a0, <2 x i64> %a1) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vprotq(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vprotw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vprotw
+; CHECK-LABEL: test_int_x86_xop_vprotw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vprotw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vprotw(<8 x i16> %a0, <8 x i16> %a1) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vprotw(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vprotbi(<16 x i8> %a0) {
- ; CHECK: vprotb
+; CHECK-LABEL: test_int_x86_xop_vprotbi:
+; CHECK: # BB#0:
+; CHECK-NEXT: vprotb $1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8> %a0, i8 1) ;
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8>, i8) nounwind readnone
define <4 x i32> @test_int_x86_xop_vprotdi(<4 x i32> %a0) {
- ; CHECK: vprotd
+; CHECK-LABEL: test_int_x86_xop_vprotdi:
+; CHECK: # BB#0:
+; CHECK-NEXT: vprotd $254, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32> %a0, i8 -2) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32>, i8) nounwind readnone
define <2 x i64> @test_int_x86_xop_vprotqi(<2 x i64> %a0) {
- ; CHECK: vprotq
+; CHECK-LABEL: test_int_x86_xop_vprotqi:
+; CHECK: # BB#0:
+; CHECK-NEXT: vprotq $3, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64> %a0, i8 3) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64>, i8) nounwind readnone
define <8 x i16> @test_int_x86_xop_vprotwi(<8 x i16> %a0) {
- ; CHECK: vprotw
+; CHECK-LABEL: test_int_x86_xop_vprotwi:
+; CHECK: # BB#0:
+; CHECK-NEXT: vprotw $252, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16> %a0, i8 -4) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16>, i8) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpshab(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpshab
+; CHECK-LABEL: test_int_x86_xop_vpshab:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshab %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %a0, <16 x i8> %a1) ;
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.xop.vpshab(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpshad(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpshad
+; CHECK-LABEL: test_int_x86_xop_vpshad:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshad %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpshad(<4 x i32> %a0, <4 x i32> %a1) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vpshad(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpshaq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpshaq
+; CHECK-LABEL: test_int_x86_xop_vpshaq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshaq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64> %a0, <2 x i64> %a1) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vpshaw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpshaw
+; CHECK-LABEL: test_int_x86_xop_vpshaw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshaw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16> %a0, <8 x i16> %a1) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpshlb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpshlb
+; CHECK-LABEL: test_int_x86_xop_vpshlb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8> %a0, <16 x i8> %a1) ;
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpshld(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpshld
+; CHECK-LABEL: test_int_x86_xop_vpshld:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshld %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpshld(<4 x i32> %a0, <4 x i32> %a1) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vpshld(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpshlq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpshlq
+; CHECK-LABEL: test_int_x86_xop_vpshlq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshlq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64> %a0, <2 x i64> %a1) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vpshlw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpshlw
+; CHECK-LABEL: test_int_x86_xop_vpshlw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshlw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16> %a0, <8 x i16> %a1) ;
ret <8 x i16> %res
}
define <8 x i16> @test_int_x86_xop_vpshlw_rm(<8 x i16> %a0, <8 x i16>* %a1) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpshlw
+; CHECK-LABEL: test_int_x86_xop_vpshlw_rm:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshlw (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %a1
%res = call <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16> %a0, <8 x i16> %vec) ;
ret <8 x i16> %res
}
define <8 x i16> @test_int_x86_xop_vpshlw_mr(<8 x i16>* %a0, <8 x i16> %a1) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vpshlw
+; CHECK-LABEL: test_int_x86_xop_vpshlw_mr:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpshlw %xmm0, (%rdi), %xmm0
+; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %a0
%res = call <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16> %vec, <8 x i16> %a1) ;
ret <8 x i16> %res
@@ -903,14 +619,18 @@ define <8 x i16> @test_int_x86_xop_vpshlw_mr(<8 x i16>* %a0, <8 x i16> %a1) {
declare <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x float> @test_int_x86_xop_vfrcz_ss(<4 x float> %a0) {
- ; CHECK-NOT: mov
- ; CHECK: vfrczss
+; CHECK-LABEL: test_int_x86_xop_vfrcz_ss:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczss %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %a0) ;
ret <4 x float> %res
}
define <4 x float> @test_int_x86_xop_vfrcz_ss_mem(float* %a0) {
- ; CHECK-NOT: mov
- ; CHECK: vfrczss
+; CHECK-LABEL: test_int_x86_xop_vfrcz_ss_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczss (%rdi), %xmm0
+; CHECK-NEXT: retq
%elem = load float, float* %a0
%vec = insertelement <4 x float> undef, float %elem, i32 0
%res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %vec) ;
@@ -919,14 +639,18 @@ define <4 x float> @test_int_x86_xop_vfrcz_ss_mem(float* %a0) {
declare <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float>) nounwind readnone
define <2 x double> @test_int_x86_xop_vfrcz_sd(<2 x double> %a0) {
- ; CHECK-NOT: mov
- ; CHECK: vfrczsd
+; CHECK-LABEL: test_int_x86_xop_vfrcz_sd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczsd %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %a0) ;
ret <2 x double> %res
}
define <2 x double> @test_int_x86_xop_vfrcz_sd_mem(double* %a0) {
- ; CHECK-NOT: mov
- ; CHECK: vfrczsd
+; CHECK-LABEL: test_int_x86_xop_vfrcz_sd_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczsd (%rdi), %xmm0
+; CHECK-NEXT: retq
%elem = load double, double* %a0
%vec = insertelement <2 x double> undef, double %elem, i32 0
%res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %vec) ;
@@ -935,13 +659,18 @@ define <2 x double> @test_int_x86_xop_vfrcz_sd_mem(double* %a0) {
declare <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double>) nounwind readnone
define <2 x double> @test_int_x86_xop_vfrcz_pd(<2 x double> %a0) {
- ; CHECK: vfrczpd
+; CHECK-LABEL: test_int_x86_xop_vfrcz_pd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczpd %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double> %a0) ;
ret <2 x double> %res
}
define <2 x double> @test_int_x86_xop_vfrcz_pd_mem(<2 x double>* %a0) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vfrczpd
+; CHECK-LABEL: test_int_x86_xop_vfrcz_pd_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczpd (%rdi), %xmm0
+; CHECK-NEXT: retq
%vec = load <2 x double>, <2 x double>* %a0
%res = call <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double> %vec) ;
ret <2 x double> %res
@@ -949,15 +678,18 @@ define <2 x double> @test_int_x86_xop_vfrcz_pd_mem(<2 x double>* %a0) {
declare <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double>) nounwind readnone
define <4 x double> @test_int_x86_xop_vfrcz_pd_256(<4 x double> %a0) {
- ; CHECK: vfrczpd
- ; CHECK: ymm
+; CHECK-LABEL: test_int_x86_xop_vfrcz_pd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczpd %ymm0, %ymm0
+; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double> %a0) ;
ret <4 x double> %res
}
define <4 x double> @test_int_x86_xop_vfrcz_pd_256_mem(<4 x double>* %a0) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vfrczpd
- ; CHECK: ymm
+; CHECK-LABEL: test_int_x86_xop_vfrcz_pd_256_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczpd (%rdi), %ymm0
+; CHECK-NEXT: retq
%vec = load <4 x double>, <4 x double>* %a0
%res = call <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double> %vec) ;
ret <4 x double> %res
@@ -965,13 +697,18 @@ define <4 x double> @test_int_x86_xop_vfrcz_pd_256_mem(<4 x double>* %a0) {
declare <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double>) nounwind readnone
define <4 x float> @test_int_x86_xop_vfrcz_ps(<4 x float> %a0) {
- ; CHECK: vfrczps
+; CHECK-LABEL: test_int_x86_xop_vfrcz_ps:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczps %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float> %a0) ;
ret <4 x float> %res
}
define <4 x float> @test_int_x86_xop_vfrcz_ps_mem(<4 x float>* %a0) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vfrczps
+; CHECK-LABEL: test_int_x86_xop_vfrcz_ps_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczps (%rdi), %xmm0
+; CHECK-NEXT: retq
%vec = load <4 x float>, <4 x float>* %a0
%res = call <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float> %vec) ;
ret <4 x float> %res
@@ -979,15 +716,18 @@ define <4 x float> @test_int_x86_xop_vfrcz_ps_mem(<4 x float>* %a0) {
declare <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float>) nounwind readnone
define <8 x float> @test_int_x86_xop_vfrcz_ps_256(<8 x float> %a0) {
- ; CHECK: vfrczps
- ; CHECK: ymm
+; CHECK-LABEL: test_int_x86_xop_vfrcz_ps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczps %ymm0, %ymm0
+; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float> %a0) ;
ret <8 x float> %res
}
define <8 x float> @test_int_x86_xop_vfrcz_ps_256_mem(<8 x float>* %a0) {
- ; CHECK-NOT: vmovaps
- ; CHECK: vfrczps
- ; CHECK: ymm
+; CHECK-LABEL: test_int_x86_xop_vfrcz_ps_256_mem:
+; CHECK: # BB#0:
+; CHECK-NEXT: vfrczps (%rdi), %ymm0
+; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %a0
%res = call <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float> %vec) ;
ret <8 x float> %res
@@ -995,56 +735,80 @@ define <8 x float> @test_int_x86_xop_vfrcz_ps_256_mem(<8 x float>* %a0) {
declare <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float>) nounwind readnone
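; Editorial note: the *_mem variants above check that the load feeding the
; intrinsic is folded into the instruction's memory operand (vfrcz* (%rdi)),
; replacing the old loose "CHECK-NOT: mov/vmovaps" assertions with full
; autogenerated check bodies.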
define <16 x i8> @test_int_x86_xop_vpcomb(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK:vpcomb
+; CHECK-LABEL: test_int_x86_xop_vpcomb:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %a0, <16 x i8> %a1, i8 0) ;
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readnone
define <8 x i16> @test_int_x86_xop_vpcomw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomw
+; CHECK-LABEL: test_int_x86_xop_vpcomw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %a0, <8 x i16> %a1, i8 0) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpcomd(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomd
+; CHECK-LABEL: test_int_x86_xop_vpcomd:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %a0, <4 x i32> %a1, i8 0) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpcomq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomq
+; CHECK-LABEL: test_int_x86_xop_vpcomq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %a0, <2 x i64> %a1, i8 0) ;
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpcomub(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK:vpcomub
+; CHECK-LABEL: test_int_x86_xop_vpcomub:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltub %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %a0, <16 x i8> %a1, i8 0) ;
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readnone
define <8 x i16> @test_int_x86_xop_vpcomuw(<8 x i16> %a0, <8 x i16> %a1) {
- ; CHECK: vpcomuw
+; CHECK-LABEL: test_int_x86_xop_vpcomuw:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltuw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %a0, <8 x i16> %a1, i8 0) ;
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpcomud(<4 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vpcomud
+; CHECK-LABEL: test_int_x86_xop_vpcomud:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltud %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %a0, <4 x i32> %a1, i8 0) ;
ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpcomuq(<2 x i64> %a0, <2 x i64> %a1) {
- ; CHECK: vpcomuq
+; CHECK-LABEL: test_int_x86_xop_vpcomuq:
+; CHECK: # BB#0:
+; CHECK-NEXT: vpcomltuq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %a0, <2 x i64> %a1, i8 0) ;
ret <2 x i64> %res
}
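; Editorial note: the vpcom* immediate operand selects the comparison
; predicate; 0 encodes "less than", which is why the autogenerated checks
; expect the vpcomlt*/vpcomltu* alias mnemonics rather than the raw vpcom*
; form with an explicit immediate.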
diff --git a/test/CodeGen/X86/xop-mask-comments.ll b/test/CodeGen/X86/xop-mask-comments.ll
new file mode 100644
index 000000000000..e4cc9101777d
--- /dev/null
+++ b/test/CodeGen/X86/xop-mask-comments.ll
@@ -0,0 +1,188 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefix=X64
+
+;
+; VPPERM
+;
+
+define <16 x i8> @vpperm_shuffle_unary(<16 x i8> %a0) {
+; X32-LABEL: vpperm_shuffle_unary:
+; X32: # BB#0:
+; X32-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: vpperm_shuffle_unary:
+; X64: # BB#0:
+; X64-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> <i8 31, i8 14, i8 29, i8 12, i8 27, i8 10, i8 25, i8 8, i8 23, i8 6, i8 21, i8 4, i8 19, i8 2, i8 17, i8 0>)
+ ret <16 x i8> %1
+}
+
+define <16 x i8> @vpperm_shuffle_unary_undef(<16 x i8> %a0) {
+; X32-LABEL: vpperm_shuffle_unary_undef:
+; X32: # BB#0:
+; X32-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT: retl
+;
+; X64-LABEL: vpperm_shuffle_unary_undef:
+; X64: # BB#0:
+; X64-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> undef, <16 x i8> <i8 31, i8 14, i8 29, i8 12, i8 27, i8 10, i8 25, i8 8, i8 23, i8 6, i8 21, i8 4, i8 19, i8 2, i8 17, i8 0>)
+ ret <16 x i8> %1
+}
+
+define <16 x i8> @vpperm_shuffle_unary_zero(<16 x i8> %a0) {
+; X32-LABEL: vpperm_shuffle_unary_zero:
+; X32: # BB#0:
+; X32-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3],zero,xmm0[1],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: vpperm_shuffle_unary_zero:
+; X64: # BB#0:
+; X64-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3],zero,xmm0[1],zero
+; X64-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> <i8 31, i8 14, i8 29, i8 12, i8 27, i8 10, i8 25, i8 8, i8 23, i8 6, i8 21, i8 4, i8 19, i8 130, i8 17, i8 128>)
+ ret <16 x i8> %1
+}
+
+define <16 x i8> @vpperm_shuffle_binary(<16 x i8> %a0, <16 x i8> %a1) {
+; X32-LABEL: vpperm_shuffle_binary:
+; X32: # BB#0:
+; X32-NEXT: vpperm {{.*#+}} xmm0 = xmm1[15],xmm0[14],xmm1[13],xmm0[12],xmm1[11],xmm0[10],xmm1[9],xmm0[8],xmm1[7],xmm0[6],xmm1[5],xmm0[4],xmm1[3],xmm0[2],xmm1[1],xmm0[0]
+; X32-NEXT: retl
+;
+; X64-LABEL: vpperm_shuffle_binary:
+; X64: # BB#0:
+; X64-NEXT: vpperm {{.*#+}} xmm0 = xmm1[15],xmm0[14],xmm1[13],xmm0[12],xmm1[11],xmm0[10],xmm1[9],xmm0[8],xmm1[7],xmm0[6],xmm1[5],xmm0[4],xmm1[3],xmm0[2],xmm1[1],xmm0[0]
+; X64-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 31, i8 14, i8 29, i8 12, i8 27, i8 10, i8 25, i8 8, i8 23, i8 6, i8 21, i8 4, i8 19, i8 2, i8 17, i8 0>)
+ ret <16 x i8> %1
+}
+
+define <16 x i8> @vpperm_shuffle_binary_zero(<16 x i8> %a0, <16 x i8> %a1) {
+; X32-LABEL: vpperm_shuffle_binary_zero:
+; X32: # BB#0:
+; X32-NEXT: vpperm {{.*#+}} xmm0 = xmm1[15],xmm0[14],xmm1[13],xmm0[12],xmm1[11],xmm0[10],xmm1[9],xmm0[8],xmm1[7],xmm0[6],xmm1[5],xmm0[4],zero,zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: vpperm_shuffle_binary_zero:
+; X64: # BB#0:
+; X64-NEXT: vpperm {{.*#+}} xmm0 = xmm1[15],xmm0[14],xmm1[13],xmm0[12],xmm1[11],xmm0[10],xmm1[9],xmm0[8],xmm1[7],xmm0[6],xmm1[5],xmm0[4],zero,zero,zero,zero
+; X64-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 31, i8 14, i8 29, i8 12, i8 27, i8 10, i8 25, i8 8, i8 23, i8 6, i8 21, i8 4, i8 147, i8 130, i8 145, i8 128>)
+ ret <16 x i8> %1
+}
+
+; We can't decode vpperm's other permute ops into shuffle mask comments.
+define <16 x i8> @vpperm_shuffle_general(<16 x i8> %a0, <16 x i8> %a1) {
+; X32-LABEL: vpperm_shuffle_general:
+; X32: # BB#0:
+; X32-NEXT: vpperm {{\.LCPI.*}}, %xmm0, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: vpperm_shuffle_general:
+; X64: # BB#0:
+; X64-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
+; X64-NEXT: retq
+ %1 = tail call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> <i8 31, i8 14, i8 29, i8 12, i8 27, i8 10, i8 25, i8 8, i8 23, i8 6, i8 21, i8 4, i8 179, i8 162, i8 177, i8 160>)
+ ret <16 x i8> %1
+}
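+; Editorial note (hedged reading of the XOP reference): each VPPERM selector
+; byte indexes one of the 32 bytes of the concatenated sources with bits [4:0]
+; and picks an operation with bits [7:5] (copy, zero, invert, fill-with-ones,
+; etc.). Only the copy and zero operations are decoded into the {{.*#+}}
+; shuffle comments above; the 0xA0/0xB3 selectors in this test use another
+; operation, so the mask is only matched as a plain constant-pool operand.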
+
+;
+; VPERMIL2
+;
+
+define <2 x double> @vpermil2pd_21(<2 x double> %a0, <2 x double> %a1) {
+; X32-LABEL: vpermil2pd_21:
+; X32: # BB#0:
+; X32-NEXT: vpermil2pd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: vpermil2pd_21:
+; X64: # BB#0:
+; X64-NEXT: vpermil2pd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: retq
+ %1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> <i64 4, i64 2>, i8 0)
+ ret <2 x double> %1
+}
+
+define <4 x double> @vpermil2pd256_0062(<4 x double> %a0, <4 x double> %a1) {
+; X32-LABEL: vpermil2pd256_0062:
+; X32: # BB#0:
+; X32-NEXT: vpermil2pd {{.*#+}} ymm0 = ymm0[0,0],ymm1[2],ymm0[2]
+; X32-NEXT: retl
+;
+; X64-LABEL: vpermil2pd256_0062:
+; X64: # BB#0:
+; X64-NEXT: vpermil2pd {{.*#+}} ymm0 = ymm0[0,0],ymm1[2],ymm0[2]
+; X64-NEXT: retq
+ %1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> <i64 0, i64 0, i64 4, i64 0>, i8 0)
+ ret <4 x double> %1
+}
+
+define <4 x double> @vpermil2pd256_zz73(<4 x double> %a0, <4 x double> %a1) {
+; X32-LABEL: vpermil2pd256_zz73:
+; X32: # BB#0:
+; X32-NEXT: vpermil2pd {{.*#+}} ymm0 = zero,zero,ymm1[3],ymm0[3]
+; X32-NEXT: retl
+;
+; X64-LABEL: vpermil2pd256_zz73:
+; X64: # BB#0:
+; X64-NEXT: vpermil2pd {{.*#+}} ymm0 = zero,zero,ymm1[3],ymm0[3]
+; X64-NEXT: retq
+ %1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> <i64 0, i64 0, i64 14, i64 10>, i8 3)
+ ret <4 x double> %1
+}
+
+define <4 x float> @vpermil2ps_0561(<4 x float> %a0, <4 x float> %a1) {
+; X32-LABEL: vpermil2ps_0561:
+; X32: # BB#0:
+; X32-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: vpermil2ps_0561:
+; X64: # BB#0:
+; X64-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[1]
+; X64-NEXT: retq
+ %1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 1>, i8 0)
+ ret <4 x float> %1
+}
+
+define <8 x float> @vpermil2ps256_098144FE(<8 x float> %a0, <8 x float> %a1) {
+; X32-LABEL: vpermil2ps256_098144FE:
+; X32: # BB#0:
+; X32-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[1,0],ymm0[1,4,4],ymm1[7,6]
+; X32-NEXT: retl
+;
+; X64-LABEL: vpermil2ps256_098144FE:
+; X64: # BB#0:
+; X64-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[1,0],ymm0[1,4,4],ymm1[7,6]
+; X64-NEXT: retq
+ %1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 5, i32 4, i32 1, i32 0, i32 0, i32 7, i32 6>, i8 0)
+ ret <8 x float> %1
+}
+
+define <8 x float> @vpermil2ps256_0zz8BzzA(<8 x float> %a0, <8 x float> %a1) {
+; X32-LABEL: vpermil2ps256_0zz8BzzA:
+; X32: # BB#0:
+; X32-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],zero,zero,ymm1[0,7],zero,zero,ymm1[6]
+; X32-NEXT: retl
+;
+; X64-LABEL: vpermil2ps256_0zz8BzzA:
+; X64: # BB#0:
+; X64-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],zero,zero,ymm1[0,7],zero,zero,ymm1[6]
+; X64-NEXT: retq
+ %1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 8, i32 4, i32 7, i32 8, i32 8, i32 6>, i8 2)
+ ret <8 x float> %1
+}
+
+declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone
+declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone
+
+declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone
+declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone
+
+declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
diff --git a/test/CodeGen/X86/xray-attribute-instrumentation.ll b/test/CodeGen/X86/xray-attribute-instrumentation.ll
new file mode 100644
index 000000000000..9e2d8934e98f
--- /dev/null
+++ b/test/CodeGen/X86/xray-attribute-instrumentation.ll
@@ -0,0 +1,13 @@
+; RUN: llc -filetype=asm -o - -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always" {
+; CHECK-LABEL: Lxray_sled_0:
+; CHECK-NEXT: .p2align 2, 0x90
+; CHECK-NEXT: .ascii "\353\t"
+; CHECK-NEXT: nopw 512(%rax,%rax)
+; CHECK-LABEL: Ltmp0:
+ ret i32 0
+; CHECK-LABEL: Lxray_sled_1:
+; CHECK-NEXT: retq
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+}
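+; Editorial note (hedged): "\353\t" is 0xEB 0x09, a two-byte short jump over
+; the 9-byte nopw that follows, giving an 11-byte patchable entry sled; the
+; exit sled pairs the retq with a 10-byte nopw, space the XRay runtime can
+; later patch to call its instrumentation trampolines.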
diff --git a/test/CodeGen/X86/xray-selective-instrumentation-miss.ll b/test/CodeGen/X86/xray-selective-instrumentation-miss.ll
new file mode 100644
index 000000000000..5b57e2541156
--- /dev/null
+++ b/test/CodeGen/X86/xray-selective-instrumentation-miss.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mcpu=nehalem < %s | not grep xray_sled_
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-apple-darwin8"
+
+define i32 @foo() nounwind uwtable "xray-instruction-threshold"="3" {
+entry:
+ ret i32 0
+}
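+; Editorial note: foo lowers to a single instruction, which is below the
+; "xray-instruction-threshold" of 3, so no xray_sled_ symbols are expected
+; (hence the "not grep" in the RUN line).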
diff --git a/test/CodeGen/X86/xray-selective-instrumentation.ll b/test/CodeGen/X86/xray-selective-instrumentation.ll
new file mode 100644
index 000000000000..4368161a2b30
--- /dev/null
+++ b/test/CodeGen/X86/xray-selective-instrumentation.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mcpu=nehalem < %s | grep xray_sled_
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-apple-darwin8"
+
+define i32 @foo() nounwind uwtable "xray-instruction-threshold"="1" {
+entry:
+ ret i32 0
+}
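+; Editorial note: with the threshold lowered to 1, the same single-instruction
+; function does get instrumented, so the RUN line expects xray_sled_ symbols
+; to appear in the output.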
diff --git a/test/CodeGen/X86/zext-fold.ll b/test/CodeGen/X86/zext-fold.ll
index a10923f7a80f..6aca4f40f0aa 100644
--- a/test/CodeGen/X86/zext-fold.ll
+++ b/test/CodeGen/X86/zext-fold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 -enable-misched=false | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-linux -enable-misched=false | FileCheck %s
;; Simple case
define i32 @test1(i8 %x) nounwind readnone {
@@ -35,7 +35,8 @@ define void @test3(i8 %x) nounwind readnone {
}
; CHECK: test3
; CHECK: movzbl {{[0-9]+}}(%esp), [[REGISTER:%e[a-z]{2}]]
-; CHECK-NEXT: movl [[REGISTER]], 4(%esp)
+; CHECK: subl $8, %esp
+; CHECK-NEXT: pushl [[REGISTER]]
; CHECK-NEXT: andl $224, [[REGISTER]]
-; CHECK-NEXT: movl [[REGISTER]], (%esp)
+; CHECK-NEXT: pushl [[REGISTER]]
; CHECK-NEXT: call{{.*}}use